flock 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -76,17 +76,19 @@ Provides bindings to K-Means clustering in Cluster 3.0
76
76
  require 'flock'
77
77
 
78
78
  data = []
79
- data << { apple: 1, orange: 1 }
80
- data << { black: 1, white: 1 }
81
- data << { white: 1, cyan: 1 }
82
- data << { orange: 1 }
83
- data << { apple: 1 }
79
+
80
+ # keys don't need to be numeric
81
+ data << { 1 => 0.5, 2 => 0.5 }
82
+ data << { 3 => 1, 4 => 1 }
83
+ data << { 4 => 1, 5 => 0.3 }
84
+ data << { 2 => 0.75 }
85
+ data << { 1 => 0.60 }
84
86
 
85
87
  pp Flock.sparse_kmeans(2, data)
86
88
 
87
- # or even more simply (defaults to 1)
88
-
89
89
  data = []
90
+
91
+ # a much simpler way to cluster text
90
92
  data << %w(apple orange)
91
93
  data << %w(black white)
92
94
  data << %w(white cyan)
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.1
1
+ 0.3.0
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'pp'
4
+ require 'flock'
5
+
6
+ data = []
7
+ data << %w(orange apple)
8
+ data << %w(black white)
9
+ data << %w(white cyan)
10
+ data << %w(orange)
11
+ data << %w(apple)
12
+
13
+ pp Flock.sparse_self_organizing_map(2, 2, data)
@@ -4,11 +4,11 @@ require 'pp'
4
4
  require 'flock'
5
5
 
6
6
  data = []
7
- data << { apple: 1, orange: 1 }
8
- data << { black: 1, white: 1 }
9
- data << { white: 1, cyan: 1 }
10
- data << { orange: 1 }
11
- data << { apple: 1 }
7
+ data << { 1 => 0.5, 2 => 0.5 }
8
+ data << { 3 => 1, 4 => 1 }
9
+ data << { 4 => 1, 5 => 0.3 }
10
+ data << { 2 => 0.75 }
11
+ data << { 1 => 0.60 }
12
12
 
13
13
  pp Flock.sparse_kmeans(2, data)
14
14
 
@@ -5,7 +5,7 @@
5
5
  * Human Genome Center, Institute of Medical Science, University of Tokyo,
6
6
  * 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
7
7
  * Contact: mdehoon 'AT' gsc.riken.jp
8
- *
8
+ *
9
9
  * Permission to use, copy, modify, and distribute this software and its
10
10
  * documentation with or without modifications and for any purpose and
11
11
  * without fee is hereby granted, provided that any copyright notices
@@ -14,7 +14,7 @@
14
14
  * names of the contributors or copyright holders not be used in
15
15
  * advertising or publicity pertaining to distribution of the software
16
16
  * without specific prior permission.
17
- *
17
+ *
18
18
  * THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
19
19
  * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
20
20
  * WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
@@ -23,7 +23,7 @@
23
23
  * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
24
24
  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
25
25
  * OR PERFORMANCE OF THIS SOFTWARE.
26
- *
26
+ *
27
27
  */
28
28
 
29
29
  #include <time.h>
@@ -334,7 +334,7 @@ static int svd(int m, int n, double** u, double w[], double** vt)
334
334
  * A=usv of a real m by n rectangular matrix, where m is greater
335
335
  * than or equal to n. Householder bidiagonalization and a variant
336
336
  * of the QR algorithm are used.
337
- *
337
+ *
338
338
  *
339
339
  * On input.
340
340
  *
@@ -929,10 +929,9 @@ positive integer if the singular value decomposition fails to converge.
929
929
 
930
930
  /* ********************************************************************* */
931
931
 
932
- static
933
932
  double euclid (int n, double** data1, double** data2, int** mask1, int** mask2,
934
933
  const double weight[], int index1, int index2, int transpose)
935
-
934
+
936
935
  /*
937
936
  Purpose
938
937
  =======
@@ -1004,7 +1003,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
1004
1003
 
1005
1004
  /* ********************************************************************* */
1006
1005
 
1007
- static
1008
1006
  double cityblock (int n, double** data1, double** data2, int** mask1,
1009
1007
  int** mask2, const double weight[], int index1, int index2, int transpose)
1010
1008
 
@@ -1080,7 +1078,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
1080
1078
 
1081
1079
  /* ********************************************************************* */
1082
1080
 
1083
- static
1084
1081
  double correlation (int n, double** data1, double** data2, int** mask1,
1085
1082
  int** mask2, const double weight[], int index1, int index2, int transpose)
1086
1083
  /*
@@ -1180,7 +1177,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
1180
1177
 
1181
1178
  /* ********************************************************************* */
1182
1179
 
1183
- static
1184
1180
  double acorrelation (int n, double** data1, double** data2, int** mask1,
1185
1181
  int** mask2, const double weight[], int index1, int index2, int transpose)
1186
1182
  /*
@@ -1279,7 +1275,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
1279
1275
 
1280
1276
  /* ********************************************************************* */
1281
1277
 
1282
- static
1283
1278
  double ucorrelation (int n, double** data1, double** data2, int** mask1,
1284
1279
  int** mask2, const double weight[], int index1, int index2, int transpose)
1285
1280
  /*
@@ -1374,7 +1369,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
1374
1369
 
1375
1370
  /* ********************************************************************* */
1376
1371
 
1377
- static
1378
1372
  double uacorrelation (int n, double** data1, double** data2, int** mask1,
1379
1373
  int** mask2, const double weight[], int index1, int index2, int transpose)
1380
1374
  /*
@@ -1469,7 +1463,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
1469
1463
 
1470
1464
  /* ********************************************************************* */
1471
1465
 
1472
- static
1473
1466
  double spearman (int n, double** data1, double** data2, int** mask1,
1474
1467
  int** mask2, const double weight[], int index1, int index2, int transpose)
1475
1468
  /*
@@ -1597,7 +1590,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
1597
1590
 
1598
1591
  /* ********************************************************************* */
1599
1592
 
1600
- static
1601
1593
  double kendall (int n, double** data1, double** data2, int** mask1, int** mask2,
1602
1594
  const double weight[], int index1, int index2, int transpose)
1603
1595
  /*
@@ -1708,7 +1700,7 @@ Otherwise, the distance between two columns in the matrix is calculated.
1708
1700
 
1709
1701
  /* ********************************************************************* */
1710
1702
 
1711
- static double(*setmetric(char dist))
1703
+ static double(*setmetric(char dist))
1712
1704
  (int, double**, double**, int**, int**, const double[], int, int, int)
1713
1705
  { switch(dist)
1714
1706
  { case 'e': return &euclid;
@@ -2203,7 +2195,7 @@ calculating the medians.
2203
2195
  }
2204
2196
  }
2205
2197
  }
2206
-
2198
+
2207
2199
  /* ********************************************************************* */
2208
2200
 
2209
2201
  int getclustercentroids(int nclusters, int nrows, int ncolumns,
@@ -2427,7 +2419,7 @@ kmeans(int nclusters, int nrows, int ncolumns, double** data, int** mask,
2427
2419
  break; /* Identical solution found; break out of this loop */
2428
2420
  }
2429
2421
 
2430
- if (npass<=1)
2422
+ if (npass<=1)
2431
2423
  { *error = total;
2432
2424
  break;
2433
2425
  }
@@ -2532,7 +2524,7 @@ kmedians(int nclusters, int nrows, int ncolumns, double** data, int** mask,
2532
2524
  break; /* Identical solution found; break out of this loop */
2533
2525
  }
2534
2526
 
2535
- if (npass<=1)
2527
+ if (npass<=1)
2536
2528
  { *error = total;
2537
2529
  break;
2538
2530
  }
@@ -2603,7 +2595,7 @@ of the matrix are clustered.
2603
2595
 
2604
2596
  npass (input) int
2605
2597
  The number of times clustering is performed. Clustering is performed npass
2606
- times, each time starting from a different (random) initial assignment of
2598
+ times, each time starting from a different (random) initial assignment of
2607
2599
  genes to clusters. The clustering solution with the lowest within-cluster sum
2608
2600
  of distances is chosen.
2609
2601
  If npass==0, then the clustering algorithm will be run once, where the initial
@@ -2697,7 +2689,7 @@ number of clusters is larger than the number of elements being clustered,
2697
2689
  return;
2698
2690
  }
2699
2691
  }
2700
-
2692
+
2701
2693
  if (method=='m')
2702
2694
  { double* cache = malloc(nelements*sizeof(double));
2703
2695
  if(cache)
@@ -3105,7 +3097,7 @@ weights array, the function returns NULL.
3105
3097
 
3106
3098
  /* ******************************************************************** */
3107
3099
 
3108
- void cuttree (int nelements, Node* tree, int nclusters, int clusterid[])
3100
+ void cuttree (int nelements, Node* tree, int nclusters, int clusterid[])
3109
3101
 
3110
3102
  /*
3111
3103
  Purpose
@@ -3160,7 +3152,7 @@ error occured, all elements in clusterid are set to -1.
3160
3152
  }
3161
3153
  for (i = 0; i < n; i++) nodeid[i] = -1;
3162
3154
  for (i = n-1; i >= 0; i--)
3163
- { if(nodeid[i]<0)
3155
+ { if(nodeid[i]<0)
3164
3156
  { j = icluster;
3165
3157
  nodeid[i] = j;
3166
3158
  icluster++;
@@ -3269,7 +3261,7 @@ If a memory error occurs, pclcluster returns NULL.
3269
3261
  if(!makedatamask(nelements, ndata, &newdata, &newmask))
3270
3262
  { free(result);
3271
3263
  free(distid);
3272
- return NULL;
3264
+ return NULL;
3273
3265
  }
3274
3266
 
3275
3267
  for (i = 0; i < nelements; i++) distid[i] = i;
@@ -3313,7 +3305,7 @@ If a memory error occurs, pclcluster returns NULL.
3313
3305
  free(mask[is]);
3314
3306
  data[is] = data[nnodes-inode];
3315
3307
  mask[is] = mask[nnodes-inode];
3316
-
3308
+
3317
3309
  /* Fix the distances */
3318
3310
  distid[is] = distid[nnodes-inode];
3319
3311
  for (i = 0; i < is; i++)
@@ -3334,7 +3326,7 @@ If a memory error occurs, pclcluster returns NULL.
3334
3326
  free(data);
3335
3327
  free(mask);
3336
3328
  free(distid);
3337
-
3329
+
3338
3330
  return result;
3339
3331
  }
3340
3332
 
@@ -3829,7 +3821,7 @@ If a memory error occurs, treecluster returns NULL.
3829
3821
  for (i = 1; i < nelements; i++) free(distmatrix[i]);
3830
3822
  free (distmatrix);
3831
3823
  }
3832
-
3824
+
3833
3825
  return result;
3834
3826
  }
3835
3827
 
@@ -4037,7 +4029,7 @@ void somworker (int nrows, int ncolumns, double** data, int** mask,
4037
4029
  static
4038
4030
  void somassign (int nrows, int ncolumns, double** data, int** mask,
4039
4031
  const double weights[], int transpose, int nxgrid, int nygrid,
4040
- double*** celldata, char dist, int clusterid[][2])
4032
+ double*** celldata, char dist, int **clusterid)
4041
4033
  /* Collect clusterids */
4042
4034
  { const int ndata = (transpose==0) ? ncolumns : nrows;
4043
4035
  int i,j;
@@ -4121,7 +4113,7 @@ void somassign (int nrows, int ncolumns, double** data, int** mask,
4121
4113
 
4122
4114
  void somcluster (int nrows, int ncolumns, double** data, int** mask,
4123
4115
  const double weight[], int transpose, int nxgrid, int nygrid,
4124
- double inittau, int niter, char dist, double*** celldata, int clusterid[][2])
4116
+ double inittau, int niter, char dist, double*** celldata, int **clusterid)
4125
4117
  /*
4126
4118
 
4127
4119
  Purpose
@@ -4235,7 +4227,7 @@ somcluster.
4235
4227
  double clusterdistance (int nrows, int ncolumns, double** data,
4236
4228
  int** mask, double weight[], int n1, int n2, int index1[], int index2[],
4237
4229
  char dist, char method, int transpose)
4238
-
4230
+
4239
4231
  /*
4240
4232
  Purpose
4241
4233
  =======
@@ -6,7 +6,7 @@
6
6
  * Human Genome Center, Institute of Medical Science, University of Tokyo,
7
7
  * 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
8
8
  * Contact: mdehoon 'AT' gsc.riken.jp
9
- *
9
+ *
10
10
  * Permission to use, copy, modify, and distribute this software and its
11
11
  * documentation with or without modifications and for any purpose and
12
12
  * without fee is hereby granted, provided that any copyright notices
@@ -15,7 +15,7 @@
15
15
  * names of the contributors or copyright holders not be used in
16
16
  * advertising or publicity pertaining to distribution of the software
17
17
  * without specific prior permission.
18
- *
18
+ *
19
19
  * THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
20
20
  * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
21
21
  * WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
@@ -24,7 +24,7 @@
24
24
  * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
25
25
  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
26
26
  * OR PERFORMANCE OF THIS SOFTWARE.
27
- *
27
+ *
28
28
  */
29
29
 
30
30
  #ifndef min
@@ -79,7 +79,7 @@ void cuttree (int nelements, Node* tree, int nclusters, int clusterid[]);
79
79
  void somcluster (int nrows, int ncolumns, double** data, int** mask,
80
80
  const double weight[], int transpose, int nxnodes, int nynodes,
81
81
  double inittau, int niter, char dist, double*** celldata,
82
- int clusterid[][2]);
82
+ int **clusterid);
83
83
 
84
84
  /* Chapter 6 */
85
85
  int pca(int m, int n, double** u, double** v, double* w);
@@ -91,3 +91,13 @@ double median (int n, double x[]);
91
91
 
92
92
  double* calculate_weights(int nrows, int ncolumns, double** data, int** mask,
93
93
  double weights[], int transpose, char dist, double cutoff, double exponent);
94
+
95
+ /* distance functions */
96
+ extern double euclid (int, double**, double**, int**, int**, const double [], int, int, int);
97
+ extern double cityblock(int, double**, double**, int**, int**, const double [], int, int, int);
98
+ extern double correlation(int, double**, double**, int**, int**, const double [], int, int, int);
99
+ extern double acorrelation(int, double**, double**, int**, int**, const double [], int, int, int);
100
+ extern double ucorrelation(int, double**, double**, int**, int**, const double [], int, int, int);
101
+ extern double uacorrelation(int, double**, double**, int**, int**, const double [], int, int, int);
102
+ extern double spearman(int, double**, double**, int**, int**, const double [], int, int, int);
103
+ extern double kendall(int, double**, double**, int**, int**, const double [], int, int, int);
@@ -5,6 +5,7 @@
5
5
  #define CONST_GET(scope, constant) (rb_funcall(scope, ID_CONST_GET, 1, rb_str_new2(constant)))
6
6
 
7
7
  static VALUE mFlock;
8
+ typedef double (*distance_fn)(int, double**, double**, int**, int**, const double [], int, int, int);
8
9
 
9
10
  int opt_int_value(VALUE option, char *key, int def) {
10
11
  if (NIL_P(option)) return def;
@@ -13,6 +14,13 @@ int opt_int_value(VALUE option, char *key, int def) {
13
14
  return NIL_P(value) ? def : NUM2INT(value);
14
15
  }
15
16
 
17
+ int opt_double_value(VALUE option, char *key, double def) {
18
+ if (NIL_P(option)) return def;
19
+
20
+ VALUE value = rb_hash_aref(option, ID2SYM(rb_intern(key)));
21
+ return NIL_P(value) ? def : NUM2DBL(value);
22
+ }
23
+
16
24
  VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
17
25
  VALUE size, data, mask, weights, options;
18
26
  rb_scan_args(argc, argv, "22", &size, &data, &mask, &options);
@@ -26,6 +34,20 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
26
34
  if (NIL_P(size) || NUM2INT(rb_Integer(size)) > RARRAY_LEN(data))
27
35
  rb_raise(rb_eArgError, "size should be > 0 and <= data size");
28
36
 
37
+ int transpose = opt_int_value(options, "transpose", 0);
38
+ int npass = opt_int_value(options, "iterations", 1000);
39
+ // a = average (arithmetic mean), m = median
40
+ int method = opt_int_value(options, "method", 'a');
41
+ // e = euclidean distance
42
+ // b = city-block distance
43
+ // c = correlation
44
+ // a = absolute value of the correlation
45
+ // u = uncentered correlation
46
+ // x = absolute uncentered correlation
47
+ // s = spearman's rank correlation
48
+ // k = kendall's tau
49
+ int dist = opt_int_value(options, "metric", 'e');
50
+
29
51
  int i,j;
30
52
  int nrows = RARRAY_LEN(data);
31
53
  int ncols = RARRAY_LEN(rb_ary_entry(data, 0));
@@ -33,16 +55,14 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
33
55
 
34
56
  double **cdata = (double**)malloc(sizeof(double*)*nrows);
35
57
  int **cmask = (int **)malloc(sizeof(int *)*nrows);
36
- double **ccentroid = (double**)malloc(sizeof(double*)*nrows);
37
- int **ccentroid_mask = (int **)malloc(sizeof(int *)*nrows);
38
58
  double *cweights = (double *)malloc(sizeof(double )*ncols);
39
- int *ccluster = (int *)malloc(sizeof(int )*nrows);
59
+
60
+ double **ccentroid;
61
+ int *ccluster, **ccentroid_mask, dimx = nrows, dimy = ncols, cdimx = nsets, cdimy = ncols;
40
62
 
41
63
  for (i = 0; i < nrows; i++) {
42
64
  cdata[i] = (double*)malloc(sizeof(double)*ncols);
43
65
  cmask[i] = (int *)malloc(sizeof(int )*ncols);
44
- ccentroid[i] = (double*)malloc(sizeof(double)*ncols);
45
- ccentroid_mask[i] = (int *)malloc(sizeof(int )*ncols);
46
66
  for (j = 0; j < ncols; j++) {
47
67
  cdata[i][j] = NUM2DBL(rb_Float(rb_ary_entry(rb_ary_entry(data, i), j)));
48
68
  cmask[i][j] = NIL_P(mask) ? 1 : NUM2INT(rb_Integer(rb_ary_entry(rb_ary_entry(mask, i), j)));
@@ -54,25 +74,27 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
54
74
  cweights[i] = NIL_P(weights) ? 1.0 : NUM2DBL(rb_Float(rb_ary_entry(weights, i)));
55
75
  }
56
76
 
57
- int transpose = opt_int_value(options, "transpose", 0);
58
- int npass = opt_int_value(options, "iterations", 1000);
59
- // a = average, m = means
60
- int method = opt_int_value(options, "method", 'a');
61
- // e = euclidian,
62
- // b = city-block distance
63
- // c = correlation
64
- // a = absolute value of the correlation
65
- // u = uncentered correlation
66
- // x = absolute uncentered correlation
67
- // s = spearman's rank correlation
68
- // k = kendall's tau
69
- int dist = opt_int_value(options, "metric", 'e');
77
+ if (transpose) {
78
+ dimx = ncols;
79
+ dimy = nrows;
80
+ cdimx = nrows;
81
+ cdimy = nsets;
82
+ }
83
+
84
+ ccluster = (int *)malloc(sizeof(int )*dimx);
85
+ ccentroid = (double**)malloc(sizeof(double*)*cdimx);
86
+ ccentroid_mask = (int **)malloc(sizeof(int *)*cdimx);
87
+
88
+ for (i = 0; i < cdimx; i++) {
89
+ ccentroid[i] = (double*)malloc(sizeof(double)*cdimy);
90
+ ccentroid_mask[i] = (int *)malloc(sizeof(int )*cdimy);
91
+ }
70
92
 
71
93
  int ifound;
72
94
  double error;
95
+
73
96
  kcluster(nsets,
74
97
  nrows, ncols, cdata, cmask, cweights, transpose, npass, method, dist, ccluster, &error, &ifound);
75
-
76
98
  getclustercentroids(nsets,
77
99
  nrows, ncols, cdata, cmask, ccluster, ccentroid, ccentroid_mask, transpose, method);
78
100
 
@@ -80,10 +102,12 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
80
102
  VALUE cluster = rb_ary_new();
81
103
  VALUE centroid = rb_ary_new();
82
104
 
83
- for (i = 0; i < nrows; i++) {
105
+ for (i = 0; i < dimx; i++)
84
106
  rb_ary_push(cluster, INT2NUM(ccluster[i]));
107
+
108
+ for (i = 0; i < cdimx; i++) {
85
109
  VALUE point = rb_ary_new();
86
- for (j = 0; j < ncols; j++)
110
+ for (j = 0; j < cdimy; j++)
87
111
  rb_ary_push(point, DBL2NUM(ccentroid[i][j]));
88
112
  rb_ary_push(centroid, point);
89
113
  }
@@ -96,6 +120,9 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
96
120
  for (i = 0; i < nrows; i++) {
97
121
  free(cdata[i]);
98
122
  free(cmask[i]);
123
+ }
124
+
125
+ for (i = 0; i < cdimx; i++) {
99
126
  free(ccentroid[i]);
100
127
  free(ccentroid_mask[i]);
101
128
  }
@@ -110,9 +137,203 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
110
137
  return result;
111
138
  }
112
139
 
140
+ VALUE rb_som(int argc, VALUE *argv, VALUE self) {
141
+ VALUE nx, ny, data, mask, weights, options;
142
+ rb_scan_args(argc, argv, "32", &nx, &ny, &data, &mask, &options);
143
+
144
+ if (TYPE(data) != T_ARRAY)
145
+ rb_raise(rb_eArgError, "data should be an array of arrays");
146
+
147
+ if (!NIL_P(mask) && TYPE(mask) != T_ARRAY)
148
+ rb_raise(rb_eArgError, "mask should be an array of arrays");
149
+
150
+ if (NIL_P(nx) || NUM2INT(rb_Integer(nx)) <= 0)
151
+ rb_raise(rb_eArgError, "nx should be > 0");
152
+
153
+ if (NIL_P(ny) || NUM2INT(rb_Integer(ny)) <= 0)
154
+ rb_raise(rb_eArgError, "ny should be > 0");
155
+
156
+ int nxgrid = NUM2INT(rb_Integer(nx));
157
+ int nygrid = NUM2INT(rb_Integer(ny));
158
+ int transpose = opt_int_value(options, "transpose", 0);
159
+ int npass = opt_int_value(options, "iterations", 1000);
160
+
161
+ // e = euclidean distance
162
+ // b = city-block distance
163
+ // c = correlation
164
+ // a = absolute value of the correlation
165
+ // u = uncentered correlation
166
+ // x = absolute uncentered correlation
167
+ // s = spearman's rank correlation
168
+ // k = kendall's tau
169
+ int dist = opt_int_value(options, "metric", 'e');
170
+ double tau = opt_double_value(options, "tau", 1.0);
171
+
172
+ int i, j, k;
173
+ int nrows = RARRAY_LEN(data);
174
+ int ncols = RARRAY_LEN(rb_ary_entry(data, 0));
175
+
176
+ double **cdata = (double**)malloc(sizeof(double*)*nrows);
177
+ int **cmask = (int **)malloc(sizeof(int *)*nrows);
178
+ double *cweights = (double *)malloc(sizeof(double )*ncols);
179
+
180
+ int **ccluster;
181
+ double ***ccelldata;
182
+ int dimx = nrows, dimy = ncols;
183
+
184
+ if (transpose) {
185
+ dimx = ncols;
186
+ dimy = nrows;
187
+ }
188
+
189
+ ccluster = (int **)malloc(sizeof(int*)*dimx);
190
+ for (i = 0; i < dimx; i++)
191
+ ccluster[i] = (int*)malloc(sizeof(int)*2);
192
+
193
+ for (i = 0; i < nrows; i++) {
194
+ cdata[i] = (double*)malloc(sizeof(double)*ncols);
195
+ cmask[i] = (int *)malloc(sizeof(int )*ncols);
196
+ for (j = 0; j < ncols; j++) {
197
+ cdata[i][j] = NUM2DBL(rb_Float(rb_ary_entry(rb_ary_entry(data, i), j)));
198
+ cmask[i][j] = NIL_P(mask) ? 1 : NUM2INT(rb_Integer(rb_ary_entry(rb_ary_entry(mask, i), j)));
199
+ }
200
+ }
201
+
202
+ weights = NIL_P(options) ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("weights")));
203
+ for (i = 0; i < ncols; i++) {
204
+ cweights[i] = NIL_P(weights) ? 1.0 : NUM2DBL(rb_Float(rb_ary_entry(weights, i)));
205
+ }
206
+
207
+ ccelldata = (double***)malloc(sizeof(double**)*nxgrid);
208
+ for (i = 0; i < nxgrid; i++) {
209
+ ccelldata[i] = (double **)malloc(sizeof(double*)*nygrid);
210
+ for (j = 0; j < nygrid; j++)
211
+ ccelldata[i][j] = (double *)malloc(sizeof(double)*dimy);
212
+ }
213
+
214
+ somcluster(nrows, ncols, cdata, cmask, cweights, transpose,
215
+ nxgrid, nygrid, tau, npass, dist, ccelldata, ccluster);
216
+
217
+ VALUE result = rb_hash_new();
218
+ VALUE cluster = rb_ary_new();
219
+ VALUE centroid = rb_ary_new();
220
+
221
+ for (i = 0; i < dimx; i++)
222
+ rb_ary_push(cluster, INT2NUM(ccluster[i][0] * nxgrid + ccluster[i][1]));
223
+
224
+ for (i = 0; i < nxgrid; i++) {
225
+ for (j = 0; j < nygrid; j++) {
226
+ VALUE point = rb_ary_new();
227
+ for (k = 0; k < dimy; k++)
228
+ rb_ary_push(point, DBL2NUM(ccelldata[i][j][k]));
229
+ rb_ary_push(centroid, point);
230
+ }
231
+ }
232
+
233
+ rb_hash_aset(result, ID2SYM(rb_intern("cluster")), cluster);
234
+ rb_hash_aset(result, ID2SYM(rb_intern("centroid")), centroid);
235
+
236
+ for (i = 0; i < nrows; i++) {
237
+ free(cdata[i]);
238
+ free(cmask[i]);
239
+ }
240
+
241
+ for (i = 0; i < dimx; i++)
242
+ free(ccluster[i]);
243
+
244
+ for (i = 0; i < nxgrid; i++) {
245
+ for (j = 0; j < nygrid; j++)
246
+ free(ccelldata[i][j]);
247
+ free(ccelldata[i]);
248
+ }
249
+
250
+ free(cdata);
251
+ free(cmask);
252
+ free(ccelldata);
253
+ free(cweights);
254
+ free(ccluster);
255
+
256
+ return result;
257
+ }
258
+
259
+
260
+ VALUE rb_distance(VALUE vec1, VALUE vec2, distance_fn fn) {
261
+ uint32_t size;
262
+ double *data1, *data2, *weight, dist;
263
+ int *mask, i;
264
+
265
+ if (TYPE(vec1) != T_ARRAY)
266
+ rb_raise(rb_eArgError, "vector1 should be an array");
267
+
268
+ if (TYPE(vec2) != T_ARRAY)
269
+ rb_raise(rb_eArgError, "vector2 should be an array");
270
+
271
+ size = RARRAY_LEN(vec1);
272
+
273
+ if (size != RARRAY_LEN(vec2))
274
+ rb_raise(rb_eArgError, "vector1 & vector2 dimensions mismatch");
275
+
276
+ if (size < 1)
277
+ rb_raise(rb_eArgError, "dimension should be greater than 0");
278
+
279
+ data1 = (double *)malloc(sizeof(double)*size);
280
+ data2 = (double *)malloc(sizeof(double)*size);
281
+ weight = (double *)malloc(sizeof(double)*size);
282
+ mask = (int *)malloc(sizeof(int)*size);
283
+
284
+ for (i = 0; i < size; i++) {
285
+ mask[i] = 1;
286
+ weight[i] = 1;
287
+ data1[i] = NUM2DBL(rb_ary_entry(vec1, i));
288
+ data2[i] = NUM2DBL(rb_ary_entry(vec2, i));
289
+ }
290
+
291
+ dist = fn(size, &data1, &data2, &mask, &mask, weight, 0, 0, 0);
292
+ free(mask);
293
+ free(weight);
294
+ free(data2);
295
+ free(data1);
296
+
297
+ return DBL2NUM(dist);
298
+ }
299
+
300
+ VALUE rb_euclid(VALUE self, VALUE vec1, VALUE vec2) {
301
+ return rb_distance(vec1, vec2, euclid);
302
+ }
303
+
304
+ VALUE rb_cityblock(VALUE self, VALUE vec1, VALUE vec2) {
305
+ return rb_distance(vec1, vec2, cityblock);
306
+ }
307
+
308
+ VALUE rb_correlation(VALUE self, VALUE vec1, VALUE vec2) {
309
+ return rb_distance(vec1, vec2, correlation);
310
+ }
311
+
312
+ VALUE rb_ucorrelation(VALUE self, VALUE vec1, VALUE vec2) {
313
+ return rb_distance(vec1, vec2, ucorrelation);
314
+ }
315
+
316
+ VALUE rb_acorrelation(VALUE self, VALUE vec1, VALUE vec2) {
317
+ return rb_distance(vec1, vec2, acorrelation);
318
+ }
319
+
320
+ VALUE rb_uacorrelation(VALUE self, VALUE vec1, VALUE vec2) {
321
+ return rb_distance(vec1, vec2, uacorrelation);
322
+ }
323
+
324
+ VALUE rb_spearman(VALUE self, VALUE vec1, VALUE vec2) {
325
+ return rb_distance(vec1, vec2, spearman);
326
+ }
327
+
328
+ VALUE rb_kendall(VALUE self, VALUE vec1, VALUE vec2) {
329
+ return rb_distance(vec1, vec2, kendall);
330
+ }
331
+
332
+
113
333
  void Init_flock(void) {
114
334
  mFlock = rb_define_module("Flock");
115
335
  rb_define_module_function(mFlock, "kmeans", RUBY_METHOD_FUNC(rb_kmeans), -1);
336
+ rb_define_module_function(mFlock, "self_organizing_map", RUBY_METHOD_FUNC(rb_som), -1);
116
337
 
117
338
  rb_define_const(mFlock, "METHOD_AVERAGE", INT2NUM('a'));
118
339
  rb_define_const(mFlock, "METHOD_MEDIAN", INT2NUM('m'));
@@ -125,4 +346,13 @@ void Init_flock(void) {
125
346
  rb_define_const(mFlock, "METRIC_ABSOLUTE_UNCENTERED_CORRELATION", INT2NUM('x'));
126
347
  rb_define_const(mFlock, "METRIC_SPEARMAN", INT2NUM('s'));
127
348
  rb_define_const(mFlock, "METRIC_KENDALL", INT2NUM('k'));
349
+
350
+ rb_define_module_function(mFlock, "euclidian_distance", RUBY_METHOD_FUNC(rb_euclid), 2);
351
+ rb_define_module_function(mFlock, "cityblock_distance", RUBY_METHOD_FUNC(rb_cityblock), 2);
352
+ rb_define_module_function(mFlock, "correlation_distance", RUBY_METHOD_FUNC(rb_correlation), 2);
353
+ rb_define_module_function(mFlock, "absolute_correlation_distance", RUBY_METHOD_FUNC(rb_acorrelation), 2);
354
+ rb_define_module_function(mFlock, "uncentered_correlation_distance", RUBY_METHOD_FUNC(rb_ucorrelation), 2);
355
+ rb_define_module_function(mFlock, "absolute_uncentered_correlation_distance", RUBY_METHOD_FUNC(rb_uacorrelation), 2);
356
+ rb_define_module_function(mFlock, "spearman_distance", RUBY_METHOD_FUNC(rb_spearman), 2);
357
+ rb_define_module_function(mFlock, "kendall_distance", RUBY_METHOD_FUNC(rb_kendall), 2);
128
358
  }
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{flock}
8
- s.version = "0.2.1"
8
+ s.version = "0.3.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Bharanee Rathna"]
12
- s.date = %q{2011-02-19}
12
+ s.date = %q{2011-04-24}
13
13
  s.description = %q{A thin ruby binding to Cluster 3.0}
14
14
  s.email = ["deepfryed@gmail.com"]
15
15
  s.extensions = ["ext/extconf.rb"]
@@ -34,6 +34,7 @@ Gem::Specification.new do |s|
34
34
  s.summary = %q{Ruby bindings to Cluster 3.0.}
35
35
  s.test_files = [
36
36
  "examples/sparse.rb",
37
+ "examples/som.rb",
37
38
  "examples/dense.rb"
38
39
  ]
39
40
 
@@ -21,7 +21,7 @@ module Flock
21
21
  [dims,data]
22
22
  end
23
23
 
24
- def self.sparse_kmeans size, sparse_data, options={}
24
+ def self.sparse_kmeans size, sparse_data, options = {}
25
25
  dims, data = sparse_data[0].kind_of?(Array) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)
26
26
 
27
27
  if options.key?(:weights)
@@ -32,4 +32,16 @@ module Flock
32
32
 
33
33
  kmeans(size, data, nil, options)
34
34
  end
35
+
36
+ def self.sparse_self_organizing_map nx, ny, sparse_data, options = {}
37
+ dims, data = sparse_data[0].kind_of?(Array) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)
38
+
39
+ if options.key?(:weights)
40
+ weights = Array.new(dims.size) {1}
41
+ options[:weights].each {|k,v| weights[dims[k]] = v }
42
+ options[:weights] = weights
43
+ end
44
+
45
+ self_organizing_map(nx, ny, data, nil, options)
46
+ end
35
47
  end
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 2
8
- - 1
9
- version: 0.2.1
7
+ - 3
8
+ - 0
9
+ version: 0.3.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Bharanee Rathna
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-02-19 00:00:00 +11:00
17
+ date: 2011-04-24 00:00:00 +10:00
18
18
  default_executable:
19
19
  dependencies: []
20
20
 
@@ -38,6 +38,7 @@ files:
38
38
  - flock.gemspec
39
39
  - lib/flock.rb
40
40
  - examples/sparse.rb
41
+ - examples/som.rb
41
42
  - examples/dense.rb
42
43
  has_rdoc: true
43
44
  homepage: http://github.com/deepfryed/flock
@@ -73,4 +74,5 @@ specification_version: 3
73
74
  summary: Ruby bindings to Cluster 3.0.
74
75
  test_files:
75
76
  - examples/sparse.rb
77
+ - examples/som.rb
76
78
  - examples/dense.rb