RubyGems - flock - Versions diffs - 0.2.1 → 0.3.0 - Mend

flock 0.2.1 → 0.3.0

Files changed (10) hide show

data/README.rdoc CHANGED

@@ -76,17 +76,19 @@ Provides bindings to K-Means clustering in Cluster 3.0
   require 'flock'
   data = []
-  data << { apple:  1, orange: 1 }
-  data << { black:  1, white:  1 }
-  data << { white:  1, cyan:   1 }
-  data << { orange: 1 }
-  data << { apple:  1 }
+  # keys don't need to be numeric
+  data << { 1 => 0.5, 2 => 0.5 }
+  data << { 3 => 1, 4 => 1 }
+  data << { 4 => 1, 5 => 0.3 }
+  data << { 2 => 0.75 }
+  data << { 1 => 0.60 }
   pp Flock.sparse_kmeans(2, data)
-  # or even more simply (defaults to 1)
   data = []
+  # a much simpler way to cluster text
   data << %w(apple orange)
   data << %w(black white)
   data << %w(white cyan)

data/VERSION CHANGED

	@@ -1 +1 @@
1	- 0.2.1
1	+ 0.3.0

data/examples/som.rb ADDED

@@ -0,0 +1,13 @@
+#!/usr/bin/ruby
+require 'pp'
+require 'flock'
+data = []
+data << %w(orange apple)
+data << %w(black white)
+data << %w(white cyan)
+data << %w(orange)
+data << %w(apple)
+pp Flock.sparse_self_organizing_map(2, 2, data)

data/examples/sparse.rb CHANGED

@@ -4,11 +4,11 @@ require 'pp'
 require 'flock'
 data = []
-data << { apple:  1, orange: 1 }
-data << { black:  1, white:  1 }
-data << { white:  1, cyan:   1 }
-data << { orange: 1 }
-data << { apple:  1 }
+data << { 1 => 0.5, 2 => 0.5 }
+data << { 3 => 1, 4 => 1 }
+data << { 4 => 1, 5 => 0.3 }
+data << { 2 => 0.75 }
+data << { 1 => 0.60 }
 pp Flock.sparse_kmeans(2, data)

data/ext/cluster.c CHANGED

@@ -5,7 +5,7 @@
  * Human Genome Center, Institute of Medical Science, University of Tokyo,
  * 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
  * Contact: mdehoon 'AT' gsc.riken.jp
- *
+ *
  * Permission to use, copy, modify, and distribute this software and its
  * documentation with or without modifications and for any purpose and
  * without fee is hereby granted, provided that any copyright notices
@@ -14,7 +14,7 @@
  * names of the contributors or copyright holders not be used in
  * advertising or publicity pertaining to distribution of the software
  * without specific prior permission.
- *
+ *
  * THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
  * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
@@ -23,7 +23,7 @@
  * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
  * OR PERFORMANCE OF THIS SOFTWARE.
- *
+ *
  */
 #include <time.h>
@@ -334,7 +334,7 @@ static int svd(int m, int n, double** u, double w[], double** vt)
  *   A=usv  of a real m by n rectangular matrix, where m is greater
  *   than or equal to n.  Householder bidiagonalization and a variant
  *   of the QR algorithm are used.
- *
+ *
  *
  *   On input.
  *
@@ -929,10 +929,9 @@ positive integer if the singular value decomposition fails to converge.
 /* ********************************************************************* */
-static
 double euclid (int n, double** data1, double** data2, int** mask1, int** mask2,
   const double weight[], int index1, int index2, int transpose)
 /*
 Purpose
 =======
@@ -1004,7 +1003,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
 /* ********************************************************************* */
-static
 double cityblock (int n, double** data1, double** data2, int** mask1,
   int** mask2, const double weight[], int index1, int index2, int transpose)
@@ -1080,7 +1078,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
 /* ********************************************************************* */
-static
 double correlation (int n, double** data1, double** data2, int** mask1,
   int** mask2, const double weight[], int index1, int index2, int transpose)
 /*
@@ -1180,7 +1177,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
 /* ********************************************************************* */
-static
 double acorrelation (int n, double** data1, double** data2, int** mask1,
   int** mask2, const double weight[], int index1, int index2, int transpose)
 /*
@@ -1279,7 +1275,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
 /* ********************************************************************* */
-static
 double ucorrelation (int n, double** data1, double** data2, int** mask1,
   int** mask2, const double weight[], int index1, int index2, int transpose)
 /*
@@ -1374,7 +1369,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
 /* ********************************************************************* */
-static
 double uacorrelation (int n, double** data1, double** data2, int** mask1,
   int** mask2, const double weight[], int index1, int index2, int transpose)
 /*
@@ -1469,7 +1463,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
 /* *********************************************************************  */
-static
 double spearman (int n, double** data1, double** data2, int** mask1,
   int** mask2, const double weight[], int index1, int index2, int transpose)
 /*
@@ -1597,7 +1590,6 @@ Otherwise, the distance between two columns in the matrix is calculated.
 /* *********************************************************************  */
-static
 double kendall (int n, double** data1, double** data2, int** mask1, int** mask2,
   const double weight[], int index1, int index2, int transpose)
 /*
@@ -1708,7 +1700,7 @@ Otherwise, the distance between two columns in the matrix is calculated.
 /* *********************************************************************  */
-static double(*setmetric(char dist))
+static double(*setmetric(char dist))
   (int, double**, double**, int**, int**, const double[], int, int, int)
 { switch(dist)
   { case 'e': return &euclid;
@@ -2203,7 +2195,7 @@ calculating the medians.
     }
   }
 }
 /* ********************************************************************* */
 int getclustercentroids(int nclusters, int nrows, int ncolumns,
@@ -2427,7 +2419,7 @@ kmeans(int nclusters, int nrows, int ncolumns, double** data, int** mask,
         break; /* Identical solution found; break out of this loop */
     }
-    if (npass<=1)
+    if (npass<=1)
     { *error = total;
       break;
     }
@@ -2532,7 +2524,7 @@ kmedians(int nclusters, int nrows, int ncolumns, double** data, int** mask,
         break; /* Identical solution found; break out of this loop */
     }
-    if (npass<=1)
+    if (npass<=1)
     { *error = total;
       break;
     }
@@ -2603,7 +2595,7 @@ of the matrix are clustered.
 npass      (input) int
 The number of times clustering is performed. Clustering is performed npass
-times, each time starting from a different (random) initial assignment of
+times, each time starting from a different (random) initial assignment of
 genes to clusters. The clustering solution with the lowest within-cluster sum
 of distances is chosen.
 If npass==0, then the clustering algorithm will be run once, where the initial
@@ -2697,7 +2689,7 @@ number of clusters is larger than the number of elements being clustered,
       return;
     }
   }
   if (method=='m')
   { double* cache = malloc(nelements*sizeof(double));
     if(cache)
@@ -3105,7 +3097,7 @@ weights array, the function returns NULL.
 /* ******************************************************************** */
-void cuttree (int nelements, Node* tree, int nclusters, int clusterid[])
+void cuttree (int nelements, Node* tree, int nclusters, int clusterid[])
 /*
 Purpose
@@ -3160,7 +3152,7 @@ error occured, all elements in clusterid are set to -1.
   }
   for (i = 0; i < n; i++) nodeid[i] = -1;
   for (i = n-1; i >= 0; i--)
-  { if(nodeid[i]<0)
+  { if(nodeid[i]<0)
     { j = icluster;
       nodeid[i] = j;
       icluster++;
@@ -3269,7 +3261,7 @@ If a memory error occurs, pclcluster returns NULL.
   if(!makedatamask(nelements, ndata, &newdata, &newmask))
   { free(result);
     free(distid);
-    return NULL;
+    return NULL;
   }
   for (i = 0; i < nelements; i++) distid[i] = i;
@@ -3313,7 +3305,7 @@ If a memory error occurs, pclcluster returns NULL.
     free(mask[is]);
     data[is] = data[nnodes-inode];
     mask[is] = mask[nnodes-inode];
     /* Fix the distances */
     distid[is] = distid[nnodes-inode];
     for (i = 0; i < is; i++)
@@ -3334,7 +3326,7 @@ If a memory error occurs, pclcluster returns NULL.
   free(data);
   free(mask);
   free(distid);
   return result;
 }
@@ -3829,7 +3821,7 @@ If a memory error occurs, treecluster returns NULL.
     for (i = 1; i < nelements; i++) free(distmatrix[i]);
     free (distmatrix);
   }
   return result;
 }
@@ -4037,7 +4029,7 @@ void somworker (int nrows, int ncolumns, double** data, int** mask,
 static
 void somassign (int nrows, int ncolumns, double** data, int** mask,
   const double weights[], int transpose, int nxgrid, int nygrid,
-  double*** celldata, char dist, int clusterid[][2])
+  double*** celldata, char dist, int **clusterid)
 /* Collect clusterids */
 { const int ndata = (transpose==0) ? ncolumns : nrows;
   int i,j;
@@ -4121,7 +4113,7 @@ void somassign (int nrows, int ncolumns, double** data, int** mask,
 void somcluster (int nrows, int ncolumns, double** data, int** mask,
   const double weight[], int transpose, int nxgrid, int nygrid,
-  double inittau, int niter, char dist, double*** celldata, int clusterid[][2])
+  double inittau, int niter, char dist, double*** celldata, int **clusterid)
 /*
 Purpose
@@ -4235,7 +4227,7 @@ somcluster.
 double clusterdistance (int nrows, int ncolumns, double** data,
   int** mask, double weight[], int n1, int n2, int index1[], int index2[],
   char dist, char method, int transpose)
 /*
 Purpose
 =======

data/ext/cluster.h CHANGED

@@ -6,7 +6,7 @@
  * Human Genome Center, Institute of Medical Science, University of Tokyo,
  * 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
  * Contact: mdehoon 'AT' gsc.riken.jp
- *
+ *
  * Permission to use, copy, modify, and distribute this software and its
  * documentation with or without modifications and for any purpose and
  * without fee is hereby granted, provided that any copyright notices
@@ -15,7 +15,7 @@
  * names of the contributors or copyright holders not be used in
  * advertising or publicity pertaining to distribution of the software
  * without specific prior permission.
- *
+ *
  * THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
  * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
@@ -24,7 +24,7 @@
  * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
  * OR PERFORMANCE OF THIS SOFTWARE.
- *
+ *
  */
 #ifndef min
@@ -79,7 +79,7 @@ void cuttree (int nelements, Node* tree, int nclusters, int clusterid[]);
 void somcluster (int nrows, int ncolumns, double** data, int** mask,
   const double weight[], int transpose, int nxnodes, int nynodes,
   double inittau, int niter, char dist, double*** celldata,
-  int clusterid[][2]);
+  int **clusterid);
 /* Chapter 6 */
 int pca(int m, int n, double** u, double** v, double* w);
@@ -91,3 +91,13 @@ double median (int n, double x[]);
 double* calculate_weights(int nrows, int ncolumns, double** data, int** mask,
   double weights[], int transpose, char dist, double cutoff, double exponent);
+/* distance functions */
+extern double euclid (int, double**, double**, int**, int**, const double [], int, int, int);
+extern double cityblock(int, double**, double**, int**, int**, const double [], int, int, int);
+extern double correlation(int, double**, double**, int**, int**, const double [], int, int, int);
+extern double acorrelation(int, double**, double**, int**, int**, const double [], int, int, int);
+extern double ucorrelation(int, double**, double**, int**, int**, const double [], int, int, int);
+extern double uacorrelation(int, double**, double**, int**, int**, const double [], int, int, int);
+extern double spearman(int, double**, double**, int**, int**, const double [], int, int, int);
+extern double kendall(int, double**, double**, int**, int**, const double [], int, int, int);

data/ext/flock.c CHANGED

@@ -5,6 +5,7 @@
 #define CONST_GET(scope, constant) (rb_funcall(scope, ID_CONST_GET, 1, rb_str_new2(constant)))
 static VALUE mFlock;
+typedef double (*distance_fn)(int, double**, double**, int**, int**, const double [], int, int, int);
 int opt_int_value(VALUE option, char *key, int def) {
   if (NIL_P(option)) return def;
@@ -13,6 +14,13 @@ int opt_int_value(VALUE option, char *key, int def) {
   return NIL_P(value) ? def : NUM2INT(value);
 }
+/*
+ * Look up a floating-point option in a Ruby options hash.
+ *
+ * option - options hash, or nil when the caller passed none.
+ * key    - option name; it is looked up as a Symbol key.
+ * def    - value returned when option is nil or the key maps to nil.
+ *
+ * Returns the stored value converted with NUM2DBL, or def.
+ * The return type is double: returning int would truncate fractional
+ * options (e.g. 0.5 would come back as 0).
+ */
+double opt_double_value(VALUE option, char *key, double def) {
+  if (NIL_P(option)) return def;
+  VALUE value = rb_hash_aref(option, ID2SYM(rb_intern(key)));
+  return NIL_P(value) ? def : NUM2DBL(value);
+}
 VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
     VALUE size, data, mask, weights, options;
     rb_scan_args(argc, argv, "22", &size, &data, &mask, &options);
@@ -26,6 +34,20 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
     if (NIL_P(size) || NUM2INT(rb_Integer(size)) > RARRAY_LEN(data))
         rb_raise(rb_eArgError, "size should be > 0 and <= data size");
+    int transpose = opt_int_value(options, "transpose", 0);
+    int npass     = opt_int_value(options, "iterations", 1000);
+    // a = average (mean), m = median
+    int method    = opt_int_value(options, "method", 'a');
+    // e = euclidian,
+    // b = city-block distance
+    // c = correlation
+    // a = absolute value of the correlation
+    // u = uncentered correlation
+    // x = absolute uncentered correlation
+    // s = spearman's rank correlation
+    // k = kendall's tau
+    int dist      = opt_int_value(options, "metric", 'e');
     int i,j;
     int nrows = RARRAY_LEN(data);
     int ncols = RARRAY_LEN(rb_ary_entry(data, 0));
@@ -33,16 +55,14 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
     double **cdata          = (double**)malloc(sizeof(double*)*nrows);
     int    **cmask          = (int   **)malloc(sizeof(int   *)*nrows);
-    double **ccentroid      = (double**)malloc(sizeof(double*)*nrows);
-    int    **ccentroid_mask = (int   **)malloc(sizeof(int   *)*nrows);
     double *cweights        = (double *)malloc(sizeof(double )*ncols);
-    int    *ccluster        = (int    *)malloc(sizeof(int    )*nrows);
+    double **ccentroid;
+    int *ccluster, **ccentroid_mask, dimx = nrows, dimy = ncols, cdimx = nsets, cdimy = ncols;
     for (i = 0; i < nrows; i++) {
         cdata[i]          = (double*)malloc(sizeof(double)*ncols);
         cmask[i]          = (int   *)malloc(sizeof(int   )*ncols);
-        ccentroid[i]      = (double*)malloc(sizeof(double)*ncols);
-        ccentroid_mask[i] = (int   *)malloc(sizeof(int   )*ncols);
         for (j = 0; j < ncols; j++) {
             cdata[i][j] = NUM2DBL(rb_Float(rb_ary_entry(rb_ary_entry(data, i), j)));
             cmask[i][j] = NIL_P(mask) ? 1 : NUM2INT(rb_Integer(rb_ary_entry(rb_ary_entry(mask, i), j)));
@@ -54,25 +74,27 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
         cweights[i] = NIL_P(weights) ? 1.0 : NUM2DBL(rb_Float(rb_ary_entry(weights, i)));
     }
-    int transpose = opt_int_value(options, "transpose", 0);
-    int npass     = opt_int_value(options, "iterations", 1000);
-    // a = average, m = means
-    int method    = opt_int_value(options, "method", 'a');
-    // e = euclidian,
-    // b = city-block distance
-    // c = correlation
-    // a = absolute value of the correlation
-    // u = uncentered correlation
-    // x = absolute uncentered correlation
-    // s = spearman's rank correlation
-    // k = kendall's tau
-    int dist      = opt_int_value(options, "metric", 'e');
+    if (transpose) {
+        dimx  = ncols;
+        dimy  = nrows;
+        cdimx = nrows;
+        cdimy = nsets;
+    }
+    ccluster       = (int    *)malloc(sizeof(int    )*dimx);
+    ccentroid      = (double**)malloc(sizeof(double*)*cdimx);
+    ccentroid_mask = (int   **)malloc(sizeof(int   *)*cdimx);
+    for (i = 0; i < cdimx; i++) {
+      ccentroid[i]      = (double*)malloc(sizeof(double)*cdimy);
+      ccentroid_mask[i] = (int   *)malloc(sizeof(int   )*cdimy);
+    }
     int    ifound;
     double error;
     kcluster(nsets,
         nrows, ncols, cdata, cmask, cweights, transpose, npass, method, dist, ccluster, &error, &ifound);
     getclustercentroids(nsets,
         nrows, ncols, cdata, cmask, ccluster, ccentroid, ccentroid_mask, transpose, method);
@@ -80,10 +102,12 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
     VALUE cluster  = rb_ary_new();
     VALUE centroid = rb_ary_new();
-    for (i = 0; i < nrows; i++) {
+    for (i = 0; i < dimx; i++)
         rb_ary_push(cluster, INT2NUM(ccluster[i]));
+    for (i = 0; i < cdimx; i++) {
         VALUE point = rb_ary_new();
-        for (j = 0; j < ncols; j++)
+        for (j = 0; j < cdimy; j++)
             rb_ary_push(point, DBL2NUM(ccentroid[i][j]));
         rb_ary_push(centroid, point);
     }
@@ -96,6 +120,9 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
     for (i = 0; i < nrows; i++) {
         free(cdata[i]);
         free(cmask[i]);
+    }
+    for (i = 0; i < cdimx; i++) {
         free(ccentroid[i]);
         free(ccentroid_mask[i]);
     }
@@ -110,9 +137,203 @@ VALUE rb_kmeans(int argc, VALUE *argv, VALUE self) {
     return result;
 }
+/*
+ * Flock.self_organizing_map(nx, ny, data, mask = nil, options = nil)
+ *
+ * Clusters the rows of data (the columns when the :transpose option is
+ * non-zero) on an nx by ny grid by calling somcluster.
+ *
+ * nx, ny  - grid dimensions; both must be > 0.
+ * data    - non-empty array of arrays of numbers.
+ * mask    - optional array of arrays of integers, indexed like data;
+ *           every entry is taken as 1 when mask is nil.
+ * options - optional hash read for :transpose (default 0), :iterations
+ *           (default 1000), :metric (default 'e'), :tau (default 1.0) and
+ *           :weights (array with one entry per value of an element,
+ *           each defaulting to 1.0).
+ *
+ * Returns a hash with
+ *   :cluster  - one cell index per clustered element, x * ny + y, which is
+ *               the position of that cell's vector in :centroid;
+ *   :centroid - the nx * ny cell vectors, pushed x-major (y varies fastest).
+ *
+ * Raises ArgumentError when an argument fails the checks below.
+ *
+ * NOTE(review): a conversion error raised while the buffers are being
+ * filled unwinds past the free() calls at the end and leaks the buffers.
+ */
+VALUE rb_som(int argc, VALUE *argv, VALUE self) {
+    VALUE nx, ny, data, mask, weights, options;
+    rb_scan_args(argc, argv, "32", &nx, &ny, &data, &mask, &options);
+    if (TYPE(data) != T_ARRAY)
+        rb_raise(rb_eArgError, "data should be an array of arrays");
+    // the column count is read from data[0], so it must exist and be an array
+    if (RARRAY_LEN(data) < 1 || TYPE(rb_ary_entry(data, 0)) != T_ARRAY)
+        rb_raise(rb_eArgError, "data should be a non-empty array of arrays");
+    if (!NIL_P(mask) && TYPE(mask) != T_ARRAY)
+        rb_raise(rb_eArgError, "mask should be an array of arrays");
+    if (NIL_P(nx) || NUM2INT(rb_Integer(nx)) <= 0)
+        rb_raise(rb_eArgError, "nx should be > 0");
+    if (NIL_P(ny) || NUM2INT(rb_Integer(ny)) <= 0)
+        rb_raise(rb_eArgError, "ny should be > 0");
+    int nxgrid    = NUM2INT(rb_Integer(nx));
+    int nygrid    = NUM2INT(rb_Integer(ny));
+    int transpose = opt_int_value(options, "transpose", 0);
+    int npass     = opt_int_value(options, "iterations", 1000);
+    // e = euclidean,
+    // b = city-block distance
+    // c = correlation
+    // a = absolute value of the correlation
+    // u = uncentered correlation
+    // x = absolute uncentered correlation
+    // s = spearman's rank correlation
+    // k = kendall's tau
+    int dist      = opt_int_value(options, "metric", 'e');
+    double tau    = opt_double_value(options, "tau", 1.0);
+    int i, j, k;
+    int nrows = RARRAY_LEN(data);
+    int ncols = RARRAY_LEN(rb_ary_entry(data, 0));
+    // dimx = number of elements being clustered, dimy = values per element
+    int dimx = transpose ? ncols : nrows;
+    int dimy = transpose ? nrows : ncols;
+    double **cdata          = (double**)malloc(sizeof(double*)*nrows);
+    int    **cmask          = (int   **)malloc(sizeof(int   *)*nrows);
+    // the metric reads one weight per value of an element, i.e. dimy entries
+    // (nrows, not ncols, when transposed)
+    double *cweights        = (double *)malloc(sizeof(double )*dimy);
+    int **ccluster;
+    double ***ccelldata;
+    // somcluster stores an (x, y) grid position for every element
+    ccluster = (int **)malloc(sizeof(int*)*dimx);
+    for (i = 0; i < dimx; i++)
+        ccluster[i] = (int*)malloc(sizeof(int)*2);
+    for (i = 0; i < nrows; i++) {
+        cdata[i]          = (double*)malloc(sizeof(double)*ncols);
+        cmask[i]          = (int   *)malloc(sizeof(int   )*ncols);
+        for (j = 0; j < ncols; j++) {
+            cdata[i][j] = NUM2DBL(rb_Float(rb_ary_entry(rb_ary_entry(data, i), j)));
+            cmask[i][j] = NIL_P(mask) ? 1 : NUM2INT(rb_Integer(rb_ary_entry(rb_ary_entry(mask, i), j)));
+        }
+    }
+    weights = NIL_P(options) ? Qnil : rb_hash_aref(options, ID2SYM(rb_intern("weights")));
+    for (i = 0; i < dimy; i++) {
+        cweights[i] = NIL_P(weights) ? 1.0 : NUM2DBL(rb_Float(rb_ary_entry(weights, i)));
+    }
+    ccelldata = (double***)malloc(sizeof(double**)*nxgrid);
+    for (i = 0; i < nxgrid; i++) {
+        ccelldata[i] = (double **)malloc(sizeof(double*)*nygrid);
+        for (j = 0; j < nygrid; j++)
+            ccelldata[i][j] = (double *)malloc(sizeof(double)*dimy);
+    }
+    somcluster(nrows, ncols, cdata, cmask, cweights, transpose,
+        nxgrid, nygrid, tau, npass, dist, ccelldata, ccluster);
+    VALUE result   = rb_hash_new();
+    VALUE cluster  = rb_ary_new();
+    VALUE centroid = rb_ary_new();
+    // flatten (x, y) with nygrid as the stride so the id matches the order in
+    // which the centroids are pushed below; using nxgrid here would make ids
+    // collide on non-square grids
+    for (i = 0; i < dimx; i++)
+        rb_ary_push(cluster, INT2NUM(ccluster[i][0] * nygrid + ccluster[i][1]));
+    for (i = 0; i < nxgrid; i++) {
+        for (j = 0; j < nygrid; j++) {
+            VALUE point = rb_ary_new();
+            for (k = 0; k < dimy; k++)
+                rb_ary_push(point, DBL2NUM(ccelldata[i][j][k]));
+            rb_ary_push(centroid, point);
+        }
+    }
+    rb_hash_aset(result, ID2SYM(rb_intern("cluster")),   cluster);
+    rb_hash_aset(result, ID2SYM(rb_intern("centroid")),  centroid);
+    for (i = 0; i < nrows; i++) {
+        free(cdata[i]);
+        free(cmask[i]);
+    }
+    for (i = 0; i < dimx; i++)
+        free(ccluster[i]);
+    for (i = 0; i < nxgrid; i++) {
+        for (j = 0; j < nygrid; j++)
+            free(ccelldata[i][j]);
+        free(ccelldata[i]);
+    }
+    free(cdata);
+    free(cmask);
+    free(ccelldata);
+    free(cweights);
+    free(ccluster);
+    return result;
+}
+/*
+ * Shared implementation behind the Flock.*_distance module functions.
+ *
+ * vec1, vec2 - Ruby arrays of numbers with the same, non-zero length.
+ * fn         - distance function applied to the two vectors.
+ *
+ * Every element is given mask 1 and weight 1. Returns the distance as a
+ * Ruby Float.
+ *
+ * Raises ArgumentError for non-array, mismatched or empty input, TypeError
+ * (from NUM2DBL) for non-numeric elements, and NoMemoryError when a work
+ * buffer cannot be allocated.
+ */
+VALUE rb_distance(VALUE vec1, VALUE vec2, distance_fn fn) {
+    /* long matches RARRAY_LEN and keeps the loop comparisons signed */
+    long size, i;
+    double *data1, *data2, *weight, dist;
+    int *mask;
+    if (TYPE(vec1) != T_ARRAY)
+        rb_raise(rb_eArgError, "vector1 should be an array");
+    if (TYPE(vec2) != T_ARRAY)
+        rb_raise(rb_eArgError, "vector2 should be an array");
+    size = RARRAY_LEN(vec1);
+    if (size != RARRAY_LEN(vec2))
+        rb_raise(rb_eArgError, "vector1 & vector2 dimensions mismatch");
+    if (size < 1)
+        rb_raise(rb_eArgError, "dimension should be greater than 0");
+    /* Validate every element before allocating: NUM2DBL raises on
+     * non-numeric input, and raising after malloc would leak the buffers. */
+    for (i = 0; i < size; i++) {
+        (void)NUM2DBL(rb_ary_entry(vec1, i));
+        (void)NUM2DBL(rb_ary_entry(vec2, i));
+    }
+    data1  = (double *)malloc(sizeof(double)*size);
+    data2  = (double *)malloc(sizeof(double)*size);
+    weight = (double *)malloc(sizeof(double)*size);
+    mask   = (int *)malloc(sizeof(int)*size);
+    if (!data1 || !data2 || !weight || !mask) {
+        /* free(NULL) is a no-op, so release whatever was obtained */
+        free(mask);
+        free(weight);
+        free(data2);
+        free(data1);
+        rb_raise(rb_eNoMemError, "failed to allocate distance buffers");
+    }
+    for (i = 0; i < size; i++) {
+        mask[i]   = 1;
+        weight[i] = 1;
+        data1[i]  = NUM2DBL(rb_ary_entry(vec1, i));
+        data2[i]  = NUM2DBL(rb_ary_entry(vec2, i));
+    }
+    /* each vector is passed as a one-row matrix; row 0 of each is compared */
+    dist = fn((int)size, &data1, &data2, &mask, &mask, weight, 0, 0, 0);
+    free(mask);
+    free(weight);
+    free(data2);
+    free(data1);
+    return DBL2NUM(dist);
+}
+/* Flock.euclidian_distance(vec1, vec2): rb_distance using the euclid metric. */
+VALUE rb_euclid(VALUE self, VALUE vec1, VALUE vec2) {
+    return rb_distance(vec1, vec2, euclid);
+}
+/* Flock.cityblock_distance(vec1, vec2): rb_distance using the cityblock metric. */
+VALUE rb_cityblock(VALUE self, VALUE vec1, VALUE vec2) {
+    return rb_distance(vec1, vec2, cityblock);
+}
+/* Flock.correlation_distance(vec1, vec2): rb_distance using the correlation metric. */
+VALUE rb_correlation(VALUE self, VALUE vec1, VALUE vec2) {
+    return rb_distance(vec1, vec2, correlation);
+}
+/* Flock.uncentered_correlation_distance(vec1, vec2): rb_distance using the ucorrelation metric. */
+VALUE rb_ucorrelation(VALUE self, VALUE vec1, VALUE vec2) {
+    return rb_distance(vec1, vec2, ucorrelation);
+}
+/* Flock.absolute_correlation_distance(vec1, vec2): rb_distance using the acorrelation metric. */
+VALUE rb_acorrelation(VALUE self, VALUE vec1, VALUE vec2) {
+    return rb_distance(vec1, vec2, acorrelation);
+}
+/* Flock.absolute_uncentered_correlation_distance(vec1, vec2): rb_distance using the uacorrelation metric. */
+VALUE rb_uacorrelation(VALUE self, VALUE vec1, VALUE vec2) {
+    return rb_distance(vec1, vec2, uacorrelation);
+}
+/* Flock.spearman_distance(vec1, vec2): rb_distance using the spearman metric. */
+VALUE rb_spearman(VALUE self, VALUE vec1, VALUE vec2) {
+    return rb_distance(vec1, vec2, spearman);
+}
+/* Flock.kendall_distance(vec1, vec2): rb_distance using the kendall metric. */
+VALUE rb_kendall(VALUE self, VALUE vec1, VALUE vec2) {
+    return rb_distance(vec1, vec2, kendall);
+}
 void Init_flock(void) {
     mFlock = rb_define_module("Flock");
     rb_define_module_function(mFlock, "kmeans", RUBY_METHOD_FUNC(rb_kmeans), -1);
+    rb_define_module_function(mFlock, "self_organizing_map", RUBY_METHOD_FUNC(rb_som), -1);
     rb_define_const(mFlock, "METHOD_AVERAGE", INT2NUM('a'));
     rb_define_const(mFlock, "METHOD_MEDIAN",  INT2NUM('m'));
@@ -125,4 +346,13 @@ void Init_flock(void) {
     rb_define_const(mFlock, "METRIC_ABSOLUTE_UNCENTERED_CORRELATION", INT2NUM('x'));
     rb_define_const(mFlock, "METRIC_SPEARMAN",                        INT2NUM('s'));
     rb_define_const(mFlock, "METRIC_KENDALL",                         INT2NUM('k'));
+    rb_define_module_function(mFlock, "euclidian_distance", RUBY_METHOD_FUNC(rb_euclid), 2);
+    rb_define_module_function(mFlock, "cityblock_distance", RUBY_METHOD_FUNC(rb_cityblock), 2);
+    rb_define_module_function(mFlock, "correlation_distance", RUBY_METHOD_FUNC(rb_correlation), 2);
+    rb_define_module_function(mFlock, "absolute_correlation_distance", RUBY_METHOD_FUNC(rb_acorrelation), 2);
+    rb_define_module_function(mFlock, "uncentered_correlation_distance", RUBY_METHOD_FUNC(rb_ucorrelation), 2);
+    rb_define_module_function(mFlock, "absolute_uncentered_correlation_distance", RUBY_METHOD_FUNC(rb_uacorrelation), 2);
+    rb_define_module_function(mFlock, "spearman_distance", RUBY_METHOD_FUNC(rb_spearman), 2);
+    rb_define_module_function(mFlock, "kendall_distance", RUBY_METHOD_FUNC(rb_kendall), 2);
 }

data/flock.gemspec CHANGED

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = %q{flock}
-  s.version = "0.2.1"
+  s.version = "0.3.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Bharanee Rathna"]
-  s.date = %q{2011-02-19}
+  s.date = %q{2011-04-24}
   s.description = %q{A thin ruby binding to Cluster 3.0}
   s.email = ["deepfryed@gmail.com"]
   s.extensions = ["ext/extconf.rb"]
@@ -34,6 +34,7 @@ Gem::Specification.new do |s|
   s.summary = %q{Ruby bindings to Cluster 3.0.}
   s.test_files = [
     "examples/sparse.rb",
+     "examples/som.rb",
      "examples/dense.rb"
   ]

data/lib/flock.rb CHANGED

@@ -21,7 +21,7 @@ module Flock
     [dims,data]
   end
-  def self.sparse_kmeans size, sparse_data, options={}
+  def self.sparse_kmeans size, sparse_data, options = {}
     dims, data = sparse_data[0].kind_of?(Array) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)
     if options.key?(:weights)
@@ -32,4 +32,16 @@ module Flock
     kmeans(size, data, nil, options)
   end
+  def self.sparse_self_organizing_map nx, ny, sparse_data, options = {}
+    dims, data = sparse_data[0].kind_of?(Array) ? sparse_array_to_data(sparse_data) : sparse_hash_to_data(sparse_data)
+    if options.key?(:weights)
+      weights = Array.new(dims.size) {1}
+      options[:weights].each {|k,v| weights[dims[k]] = v }
+      options[:weights] = weights
+    end
+    self_organizing_map(nx, ny, data, nil, options)
+  end
 end

metadata CHANGED

@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
   prerelease: false
   segments:
   - 0
-  - 2
-  - 1
-  version: 0.2.1
+  - 3
+  - 0
+  version: 0.3.0
 platform: ruby
 authors:
 - Bharanee Rathna
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-02-19 00:00:00 +11:00
+date: 2011-04-24 00:00:00 +10:00
 default_executable:
 dependencies: []
@@ -38,6 +38,7 @@ files:
 - flock.gemspec
 - lib/flock.rb
 - examples/sparse.rb
+- examples/som.rb
 - examples/dense.rb
 has_rdoc: true
 homepage: http://github.com/deepfryed/flock
@@ -73,4 +74,5 @@ specification_version: 3
 summary: Ruby bindings to Cluster 3.0.
 test_files:
 - examples/sparse.rb
+- examples/som.rb
 - examples/dense.rb