rbcluster 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,9 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ *.rbc
6
+ *.bundle
7
+ *.so
8
+ tmp/
9
+ .tm_properties
@@ -0,0 +1,6 @@
1
+ rvm:
2
+ - 1.8.7
3
+ - 1.9.2
4
+ - 1.9.3
5
+ - ruby-head
6
+ - rbx
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in rbcluster.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,29 @@
1
+ rbcluster
2
+ Copyright (C) 2011-2012 Jari Bakken.
3
+
4
+ The C clustering library.
5
+ Copyright (C) 2002 Michiel Jan Laurens de Hoon.
6
+
7
+ This library was written at the Laboratory of DNA Information Analysis,
8
+ Human Genome Center, Institute of Medical Science, University of Tokyo,
9
+ 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
10
+ Contact: mdehoon 'AT' gsc.riken.jp
11
+
12
+ Permission to use, copy, modify, and distribute this software and its
13
+ documentation with or without modifications and for any purpose and
14
+ without fee is hereby granted, provided that any copyright notices
15
+ appear in all copies and that both those copyright notices and this
16
+ permission notice appear in supporting documentation, and that the
17
+ names of the contributors or copyright holders not be used in
18
+ advertising or publicity pertaining to distribution of the software
19
+ without specific prior permission.
20
+
21
+ THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
22
+ WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
23
+ WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
24
+ CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
25
+ OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
26
+ OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
27
+ OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
28
+ OR PERFORMANCE OF THIS SOFTWARE.
29
+
@@ -0,0 +1,54 @@
1
+ rbcluster
2
+ =========
3
+
4
+ Ruby bindings to the Cluster C library.
5
+
6
+ TODO
7
+ ----
8
+
9
+ Functions:
10
+
11
+ * Cluster.clustercentroids
12
+ * Cluster.clustermedoids
13
+ * Cluster::Tree#{cut,slice,[],fetch}
14
+
15
+ Other:
16
+
17
+ * an examples/ folder
18
+ * make :transpose work
19
+ * specs for bad inputs
20
+
21
+ DONE
22
+ ----
23
+
24
+ * Cluster.pca
25
+ * Cluster.somcluster
26
+ * Cluster.treecluster
27
+ * Cluster.clusterdistance
28
+ * Cluster.kcluster
29
+ * Cluster.kmedoids
30
+ * Cluster.distancematrix
31
+ * Cluster.median
32
+ * Cluster.mean
33
+
34
+ See also
35
+ --------
36
+
37
+ * http://bonsai.hgc.jp/~mdehoon/software/cluster/software.htm
38
+ * http://bonsai.hgc.jp/~mdehoon/software/cluster/cluster.pdf
39
+
40
+ Note on Patches/Pull Requests
41
+ -----------------------------
42
+
43
+ * Fork the project.
44
+ * Make your feature addition or bug fix.
45
+ * Add tests for it. This is important so I don't break it in a
46
+ future version unintentionally.
47
+ * Commit, do not mess with rakefile, version, or history.
48
+ (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
49
+ * Send me a pull request. Bonus points for topic branches.
50
+
51
+ Copyright
52
+ ---------
53
+
54
+ Copyright (c) 2011-2012 Jari Bakken. See LICENSE for details.
@@ -0,0 +1,17 @@
1
+ include Rake::DSL if defined?(Rake::DSL)
2
+
3
+ require 'bundler'
4
+ Bundler::GemHelper.install_tasks
5
+
6
+ require 'rake/extensiontask'
7
+ Rake::ExtensionTask.new do |ext|
8
+ ext.name = "rbcluster"
9
+ ext.lib_dir = "lib/rbcluster"
10
+ ext.ext_dir = "ext/rbcluster"
11
+ ext.gem_spec = eval(File.read("./rbcluster.gemspec"))
12
+ end
13
+
14
+ require 'rspec/core/rake_task'
15
+ RSpec::Core::RakeTask.new
16
+
17
+ task :default => [:compile, :spec]
@@ -0,0 +1,10 @@
1
+ require 'rbcluster'
2
+
3
+ data = [
4
+ [1, 1, 0],
5
+ [1, 0, 0],
6
+ [0, 0, 0]
7
+ ]
8
+
9
+ labels, error, nfound = Cluster.kcluster(data, :clusters => 2)
10
+ p [labels, error, nfound]
@@ -0,0 +1,4598 @@
1
+ /* The C clustering library.
2
+ * Copyright (C) 2002 Michiel Jan Laurens de Hoon.
3
+ *
4
+ * This library was written at the Laboratory of DNA Information Analysis,
5
+ * Human Genome Center, Institute of Medical Science, University of Tokyo,
6
+ * 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
7
+ * Contact: mdehoon 'AT' gsc.riken.jp
8
+ *
9
+ * Permission to use, copy, modify, and distribute this software and its
10
+ * documentation with or without modifications and for any purpose and
11
+ * without fee is hereby granted, provided that any copyright notices
12
+ * appear in all copies and that both those copyright notices and this
13
+ * permission notice appear in supporting documentation, and that the
14
+ * names of the contributors or copyright holders not be used in
15
+ * advertising or publicity pertaining to distribution of the software
16
+ * without specific prior permission.
17
+ *
18
+ * THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
19
+ * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
20
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
21
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
22
+ * OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
23
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
24
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
25
+ * OR PERFORMANCE OF THIS SOFTWARE.
26
+ *
27
+ */
28
+
29
+ #include <time.h>
30
+ #include <stdlib.h>
31
+ #include <math.h>
32
+ #include <float.h>
33
+ #include <limits.h>
34
+ #include <string.h>
35
+ #include "cluster.h"
36
+ #ifdef WINDOWS
37
+ # include <windows.h>
38
+ #endif
39
+
40
+ /* ************************************************************************ */
41
+
42
+ #ifdef WINDOWS
43
+ /* Then we make a Windows DLL */
44
+ int WINAPI
45
+ clusterdll_init (HANDLE h, DWORD reason, void* foo)
46
+ {
47
+ return 1;
48
+ }
49
+ #endif
50
+
51
+ /* ************************************************************************ */
52
+
53
+ double mean(int n, double x[])
54
+ { double result = 0.;
55
+ int i;
56
+ for (i = 0; i < n; i++) result += x[i];
57
+ result /= n;
58
+ return result;
59
+ }
60
+
61
+ /* ************************************************************************ */
62
+
63
+ double median (int n, double x[])
64
+ /*
65
+ Find the median of X(1), ... , X(N), using as much of the quicksort
66
+ algorithm as is needed to isolate it.
67
+ N.B. On exit, the array X is partially ordered.
68
+ Based on Alan J. Miller's median.f90 routine.
69
+ */
70
+
71
+ { int i, j;
72
+ int nr = n / 2;
73
+ int nl = nr - 1;
74
+ int even = 0;
75
+ /* hi & lo are position limits encompassing the median. */
76
+ int lo = 0;
77
+ int hi = n-1;
78
+
79
+ if (n==2*nr) even = 1;
80
+ if (n<3)
81
+ { if (n<1) return 0.;
82
+ if (n == 1) return x[0];
83
+ return 0.5*(x[0]+x[1]);
84
+ }
85
+
86
+ /* Find median of 1st, middle & last values. */
87
+ do
88
+ { int loop;
89
+ int mid = (lo + hi)/2;
90
+ double result = x[mid];
91
+ double xlo = x[lo];
92
+ double xhi = x[hi];
93
+ if (xhi<xlo)
94
+ { double temp = xlo;
95
+ xlo = xhi;
96
+ xhi = temp;
97
+ }
98
+ if (result>xhi) result = xhi;
99
+ else if (result<xlo) result = xlo;
100
+ /* The basic quicksort algorithm to move all values <= the sort key (XMED)
101
+ * to the left-hand end, and all higher values to the other end.
102
+ */
103
+ i = lo;
104
+ j = hi;
105
+ do
106
+ { while (x[i]<result) i++;
107
+ while (x[j]>result) j--;
108
+ loop = 0;
109
+ if (i<j)
110
+ { double temp = x[i];
111
+ x[i] = x[j];
112
+ x[j] = temp;
113
+ i++;
114
+ j--;
115
+ if (i<=j) loop = 1;
116
+ }
117
+ } while (loop); /* Decide which half the median is in. */
118
+
119
+ if (even)
120
+ { if (j==nl && i==nr)
121
+ /* Special case, n even, j = n/2 & i = j + 1, so the median is
122
+ * between the two halves of the series. Find max. of the first
123
+ * half & min. of the second half, then average.
124
+ */
125
+ { int k;
126
+ double xmax = x[0];
127
+ double xmin = x[n-1];
128
+ for (k = lo; k <= j; k++) xmax = max(xmax,x[k]);
129
+ for (k = i; k <= hi; k++) xmin = min(xmin,x[k]);
130
+ return 0.5*(xmin + xmax);
131
+ }
132
+ if (j<nl) lo = i;
133
+ if (i>nr) hi = j;
134
+ if (i==j)
135
+ { if (i==nl) lo = nl;
136
+ if (j==nr) hi = nr;
137
+ }
138
+ }
139
+ else
140
+ { if (j<nr) lo = i;
141
+ if (i>nr) hi = j;
142
+ /* Test whether median has been isolated. */
143
+ if (i==j && i==nr) return result;
144
+ }
145
+ }
146
+ while (lo<hi-1);
147
+
148
+ if (even) return (0.5*(x[nl]+x[nr]));
149
+ if (x[lo]>x[hi])
150
+ { double temp = x[lo];
151
+ x[lo] = x[hi];
152
+ x[hi] = temp;
153
+ }
154
+ return x[nr];
155
+ }
156
+
157
+ /* ********************************************************************** */
158
+
159
+ static const double* sortdata = NULL; /* used in the quicksort algorithm */
160
+
161
+ /* ---------------------------------------------------------------------- */
162
+
163
+ static
164
+ int compare(const void* a, const void* b)
165
+ /* Helper function for sort. Previously, this was a nested function under
166
+ * sort, which is not allowed under ANSI C.
167
+ */
168
+ { const int i1 = *(const int*)a;
169
+ const int i2 = *(const int*)b;
170
+ const double term1 = sortdata[i1];
171
+ const double term2 = sortdata[i2];
172
+ if (term1 < term2) return -1;
173
+ if (term1 > term2) return +1;
174
+ return 0;
175
+ }
176
+
177
+ /* ---------------------------------------------------------------------- */
178
+
179
+ void sort(int n, const double data[], int index[])
180
+ /* Sets up an index table given the data, such that data[index[]] is in
181
+ * increasing order. Sorting is done on the indices; the array data
182
+ * is unchanged.
183
+ */
184
+ { int i;
185
+ sortdata = data;
186
+ for (i = 0; i < n; i++) index[i] = i;
187
+ qsort(index, n, sizeof(int), compare);
188
+ }
189
+
190
+ /* ********************************************************************** */
191
+
192
+ static double* getrank (int n, double data[])
193
+ /* Calculates the ranks of the elements in the array data. Two elements with
194
+ * the same value get the same rank, equal to the average of the ranks had the
195
+ * elements different values. The ranks are returned as a newly allocated
196
+ * array that should be freed by the calling routine. If getrank fails due to
197
+ * a memory allocation error, it returns NULL.
198
+ */
199
+ { int i;
200
+ double* rank;
201
+ int* index;
202
+ rank = malloc(n*sizeof(double));
203
+ if (!rank) return NULL;
204
+ index = malloc(n*sizeof(int));
205
+ if (!index)
206
+ { free(rank);
207
+ return NULL;
208
+ }
209
+ /* Call sort to get an index table */
210
+ sort (n, data, index);
211
+ /* Build a rank table */
212
+ for (i = 0; i < n; i++) rank[index[i]] = i;
213
+ /* Fix for equal ranks */
214
+ i = 0;
215
+ while (i < n)
216
+ { int m;
217
+ double value = data[index[i]];
218
+ int j = i + 1;
219
+ while (j < n && data[index[j]] == value) j++;
220
+ m = j - i; /* number of equal ranks found */
221
+ value = rank[index[i]] + (m-1)/2.;
222
+ for (j = i; j < i + m; j++) rank[index[j]] = value;
223
+ i += m;
224
+ }
225
+ free (index);
226
+ return rank;
227
+ }
228
+
229
+ /* ---------------------------------------------------------------------- */
230
+
231
+ static int
232
+ makedatamask(int nrows, int ncols, double*** pdata, int*** pmask)
233
+ { int i;
234
+ double** data;
235
+ int** mask;
236
+ data = malloc(nrows*sizeof(double*));
237
+ if(!data) return 0;
238
+ mask = malloc(nrows*sizeof(int*));
239
+ if(!mask)
240
+ { free(data);
241
+ return 0;
242
+ }
243
+ for (i = 0; i < nrows; i++)
244
+ { data[i] = malloc(ncols*sizeof(double));
245
+ if(!data[i]) break;
246
+ mask[i] = malloc(ncols*sizeof(int));
247
+ if(!mask[i])
248
+ { free(data[i]);
249
+ break;
250
+ }
251
+ }
252
+ if (i==nrows) /* break not encountered */
253
+ { *pdata = data;
254
+ *pmask = mask;
255
+ return 1;
256
+ }
257
+ *pdata = NULL;
258
+ *pmask = NULL;
259
+ nrows = i;
260
+ for (i = 0; i < nrows; i++)
261
+ { free(data[i]);
262
+ free(mask[i]);
263
+ }
264
+ free(data);
265
+ free(mask);
266
+ return 0;
267
+ }
268
+
269
+ /* ---------------------------------------------------------------------- */
270
+
271
+ static void
272
+ freedatamask(int n, double** data, int** mask)
273
+ { int i;
274
+ for (i = 0; i < n; i++)
275
+ { free(mask[i]);
276
+ free(data[i]);
277
+ }
278
+ free(mask);
279
+ free(data);
280
+ }
281
+
282
+ /* ---------------------------------------------------------------------- */
283
+
284
+ static
285
+ double find_closest_pair(int n, double** distmatrix, int* ip, int* jp)
286
+ /*
287
+ This function searches the distance matrix to find the pair with the shortest
288
+ distance between them. The indices of the pair are returned in ip and jp; the
289
+ distance itself is returned by the function.
290
+
291
+ n (input) int
292
+ The number of elements in the distance matrix.
293
+
294
+ distmatrix (input) double**
295
+ A ragged array containing the distance matrix. The number of columns in each
296
+ row is one less than the row index.
297
+
298
+ ip (output) int*
299
+ A pointer to the integer that is to receive the first index of the pair with
300
+ the shortest distance.
301
+
302
+ jp (output) int*
303
+ A pointer to the integer that is to receive the second index of the pair with
304
+ the shortest distance.
305
+ */
306
+ { int i, j;
307
+ double temp;
308
+ double distance = distmatrix[1][0];
309
+ *ip = 1;
310
+ *jp = 0;
311
+ for (i = 1; i < n; i++)
312
+ { for (j = 0; j < i; j++)
313
+ { temp = distmatrix[i][j];
314
+ if (temp<distance)
315
+ { distance = temp;
316
+ *ip = i;
317
+ *jp = j;
318
+ }
319
+ }
320
+ }
321
+ return distance;
322
+ }
323
+
324
+ /* ********************************************************************* */
325
+
326
+ static int svd(int m, int n, double** u, double w[], double** vt)
327
+ /*
328
+ * This subroutine is a translation of the Algol procedure svd,
329
+ * Num. Math. 14, 403-420(1970) by Golub and Reinsch.
330
+ * Handbook for Auto. Comp., Vol II-Linear Algebra, 134-151(1971).
331
+ *
332
+ * This subroutine determines the singular value decomposition
333
+ * t
334
+ * A=usv of a real m by n rectangular matrix, where m is greater
335
+ * than or equal to n. Householder bidiagonalization and a variant
336
+ * of the QR algorithm are used.
337
+ *
338
+ *
339
+ * On input.
340
+ *
341
+ * m is the number of rows of A (and u).
342
+ *
343
+ * n is the number of columns of A (and u) and the order of v.
344
+ *
345
+ * u contains the rectangular input matrix A to be decomposed.
346
+ *
347
+ * On output.
348
+ *
349
+ * the routine returns an integer ierr equal to
350
+ * 0 to indicate a normal return,
351
+ * k if the k-th singular value has not been
352
+ * determined after 30 iterations,
353
+ * -1 if memory allocation fails.
354
+ *
355
+ *
356
+ * w contains the n (non-negative) singular values of a (the
357
+ * diagonal elements of s). they are unordered. if an
358
+ * error exit is made, the singular values should be correct
359
+ * for indices ierr+1,ierr+2,...,n.
360
+ *
361
+ *
362
+ * u contains the matrix u (orthogonal column vectors) of the
363
+ * decomposition.
364
+ * if an error exit is made, the columns of u corresponding
365
+ * to indices of correct singular values should be correct.
366
+ *
367
+ * t
368
+ * vt contains the matrix v (orthogonal) of the decomposition.
369
+ * if an error exit is made, the columns of v corresponding
370
+ * to indices of correct singular values should be correct.
371
+ *
372
+ *
373
+ * Questions and comments should be directed to B. S. Garbow,
374
+ * Applied Mathematics division, Argonne National Laboratory
375
+ *
376
+ * Modified to eliminate machep
377
+ *
378
+ * Translated to C by Michiel de Hoon, Human Genome Center,
379
+ * University of Tokyo, for inclusion in the C Clustering Library.
380
+ * This routine is less general than the original svd routine, as
381
+ * it focuses on the singular value decomposition as needed for
382
+ * clustering. In particular,
383
+ * - We calculate both u and v in all cases
384
+ * - We pass the input array A via u; this array is subsequently
385
+ * overwritten.
386
+ * - We allocate for the array rv1, used as a working space,
387
+ * internally in this routine, instead of passing it as an
388
+ * argument. If the allocation fails, svd returns -1.
389
+ * 2003.06.05
390
+ */
391
+ { int i, j, k, i1, k1, l1, its;
392
+ double c,f,h,s,x,y,z;
393
+ int l = 0;
394
+ int ierr = 0;
395
+ double g = 0.0;
396
+ double scale = 0.0;
397
+ double anorm = 0.0;
398
+ double* rv1 = malloc(n*sizeof(double));
399
+ if (!rv1) return -1;
400
+ if (m >= n)
401
+ { /* Householder reduction to bidiagonal form */
402
+ for (i = 0; i < n; i++)
403
+ { l = i + 1;
404
+ rv1[i] = scale * g;
405
+ g = 0.0;
406
+ s = 0.0;
407
+ scale = 0.0;
408
+ for (k = i; k < m; k++) scale += fabs(u[k][i]);
409
+ if (scale != 0.0)
410
+ { for (k = i; k < m; k++)
411
+ { u[k][i] /= scale;
412
+ s += u[k][i]*u[k][i];
413
+ }
414
+ f = u[i][i];
415
+ g = (f >= 0) ? -sqrt(s) : sqrt(s);
416
+ h = f * g - s;
417
+ u[i][i] = f - g;
418
+ if (i < n-1)
419
+ { for (j = l; j < n; j++)
420
+ { s = 0.0;
421
+ for (k = i; k < m; k++) s += u[k][i] * u[k][j];
422
+ f = s / h;
423
+ for (k = i; k < m; k++) u[k][j] += f * u[k][i];
424
+ }
425
+ }
426
+ for (k = i; k < m; k++) u[k][i] *= scale;
427
+ }
428
+ w[i] = scale * g;
429
+ g = 0.0;
430
+ s = 0.0;
431
+ scale = 0.0;
432
+ if (i<n-1)
433
+ { for (k = l; k < n; k++) scale += fabs(u[i][k]);
434
+ if (scale != 0.0)
435
+ { for (k = l; k < n; k++)
436
+ { u[i][k] /= scale;
437
+ s += u[i][k] * u[i][k];
438
+ }
439
+ f = u[i][l];
440
+ g = (f >= 0) ? -sqrt(s) : sqrt(s);
441
+ h = f * g - s;
442
+ u[i][l] = f - g;
443
+ for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
444
+ for (j = l; j < m; j++)
445
+ { s = 0.0;
446
+ for (k = l; k < n; k++) s += u[j][k] * u[i][k];
447
+ for (k = l; k < n; k++) u[j][k] += s * rv1[k];
448
+ }
449
+ for (k = l; k < n; k++) u[i][k] *= scale;
450
+ }
451
+ }
452
+ anorm = max(anorm,fabs(w[i])+fabs(rv1[i]));
453
+ }
454
+ /* accumulation of right-hand transformations */
455
+ for (i = n-1; i>=0; i--)
456
+ { if (i < n-1)
457
+ { if (g != 0.0)
458
+ { for (j = l; j < n; j++) vt[i][j] = (u[i][j] / u[i][l]) / g;
459
+ /* double division avoids possible underflow */
460
+ for (j = l; j < n; j++)
461
+ { s = 0.0;
462
+ for (k = l; k < n; k++) s += u[i][k] * vt[j][k];
463
+ for (k = l; k < n; k++) vt[j][k] += s * vt[i][k];
464
+ }
465
+ }
466
+ }
467
+ for (j = l; j < n; j++)
468
+ { vt[j][i] = 0.0;
469
+ vt[i][j] = 0.0;
470
+ }
471
+ vt[i][i] = 1.0;
472
+ g = rv1[i];
473
+ l = i;
474
+ }
475
+ /* accumulation of left-hand transformations */
476
+ for (i = n-1; i >= 0; i--)
477
+ { l = i + 1;
478
+ g = w[i];
479
+ if (i!=n-1)
480
+ for (j = l; j < n; j++) u[i][j] = 0.0;
481
+ if (g!=0.0)
482
+ { if (i!=n-1)
483
+ { for (j = l; j < n; j++)
484
+ { s = 0.0;
485
+ for (k = l; k < m; k++) s += u[k][i] * u[k][j];
486
+ /* double division avoids possible underflow */
487
+ f = (s / u[i][i]) / g;
488
+ for (k = i; k < m; k++) u[k][j] += f * u[k][i];
489
+ }
490
+ }
491
+ for (j = i; j < m; j++) u[j][i] /= g;
492
+ }
493
+ else
494
+ for (j = i; j < m; j++) u[j][i] = 0.0;
495
+ u[i][i] += 1.0;
496
+ }
497
+ /* diagonalization of the bidiagonal form */
498
+ for (k = n-1; k >= 0; k--)
499
+ { k1 = k-1;
500
+ its = 0;
501
+ while(1)
502
+ /* test for splitting */
503
+ { for (l = k; l >= 0; l--)
504
+ { l1 = l-1;
505
+ if (fabs(rv1[l]) + anorm == anorm) break;
506
+ /* rv1[0] is always zero, so there is no exit
507
+ * through the bottom of the loop */
508
+ if (fabs(w[l1]) + anorm == anorm)
509
+ /* cancellation of rv1[l] if l greater than 0 */
510
+ { c = 0.0;
511
+ s = 1.0;
512
+ for (i = l; i <= k; i++)
513
+ { f = s * rv1[i];
514
+ rv1[i] *= c;
515
+ if (fabs(f) + anorm == anorm) break;
516
+ g = w[i];
517
+ h = sqrt(f*f+g*g);
518
+ w[i] = h;
519
+ c = g / h;
520
+ s = -f / h;
521
+ for (j = 0; j < m; j++)
522
+ { y = u[j][l1];
523
+ z = u[j][i];
524
+ u[j][l1] = y * c + z * s;
525
+ u[j][i] = -y * s + z * c;
526
+ }
527
+ }
528
+ break;
529
+ }
530
+ }
531
+ /* test for convergence */
532
+ z = w[k];
533
+ if (l==k) /* convergence */
534
+ { if (z < 0.0)
535
+ /* w[k] is made non-negative */
536
+ { w[k] = -z;
537
+ for (j = 0; j < n; j++) vt[k][j] = -vt[k][j];
538
+ }
539
+ break;
540
+ }
541
+ else if (its==30)
542
+ { ierr = k;
543
+ break;
544
+ }
545
+ else
546
+ /* shift from bottom 2 by 2 minor */
547
+ { its++;
548
+ x = w[l];
549
+ y = w[k1];
550
+ g = rv1[k1];
551
+ h = rv1[k];
552
+ f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
553
+ g = sqrt(f*f+1.0);
554
+ f = ((x - z) * (x + z) + h * (y / (f + (f >= 0 ? g : -g)) - h)) / x;
555
+ /* next qr transformation */
556
+ c = 1.0;
557
+ s = 1.0;
558
+ for (i1 = l; i1 <= k1; i1++)
559
+ { i = i1 + 1;
560
+ g = rv1[i];
561
+ y = w[i];
562
+ h = s * g;
563
+ g = c * g;
564
+ z = sqrt(f*f+h*h);
565
+ rv1[i1] = z;
566
+ c = f / z;
567
+ s = h / z;
568
+ f = x * c + g * s;
569
+ g = -x * s + g * c;
570
+ h = y * s;
571
+ y = y * c;
572
+ for (j = 0; j < n; j++)
573
+ { x = vt[i1][j];
574
+ z = vt[i][j];
575
+ vt[i1][j] = x * c + z * s;
576
+ vt[i][j] = -x * s + z * c;
577
+ }
578
+ z = sqrt(f*f+h*h);
579
+ w[i1] = z;
580
+ /* rotation can be arbitrary if z is zero */
581
+ if (z!=0.0)
582
+ { c = f / z;
583
+ s = h / z;
584
+ }
585
+ f = c * g + s * y;
586
+ x = -s * g + c * y;
587
+ for (j = 0; j < m; j++)
588
+ { y = u[j][i1];
589
+ z = u[j][i];
590
+ u[j][i1] = y * c + z * s;
591
+ u[j][i] = -y * s + z * c;
592
+ }
593
+ }
594
+ rv1[l] = 0.0;
595
+ rv1[k] = f;
596
+ w[k] = x;
597
+ }
598
+ }
599
+ }
600
+ }
601
+ else /* m < n */
602
+ { /* Householder reduction to bidiagonal form */
603
+ for (i = 0; i < m; i++)
604
+ { l = i + 1;
605
+ rv1[i] = scale * g;
606
+ g = 0.0;
607
+ s = 0.0;
608
+ scale = 0.0;
609
+ for (k = i; k < n; k++) scale += fabs(u[i][k]);
610
+ if (scale != 0.0)
611
+ { for (k = i; k < n; k++)
612
+ { u[i][k] /= scale;
613
+ s += u[i][k]*u[i][k];
614
+ }
615
+ f = u[i][i];
616
+ g = (f >= 0) ? -sqrt(s) : sqrt(s);
617
+ h = f * g - s;
618
+ u[i][i] = f - g;
619
+ if (i < m-1)
620
+ { for (j = l; j < m; j++)
621
+ { s = 0.0;
622
+ for (k = i; k < n; k++) s += u[i][k] * u[j][k];
623
+ f = s / h;
624
+ for (k = i; k < n; k++) u[j][k] += f * u[i][k];
625
+ }
626
+ }
627
+ for (k = i; k < n; k++) u[i][k] *= scale;
628
+ }
629
+ w[i] = scale * g;
630
+ g = 0.0;
631
+ s = 0.0;
632
+ scale = 0.0;
633
+ if (i<m-1)
634
+ { for (k = l; k < m; k++) scale += fabs(u[k][i]);
635
+ if (scale != 0.0)
636
+ { for (k = l; k < m; k++)
637
+ { u[k][i] /= scale;
638
+ s += u[k][i] * u[k][i];
639
+ }
640
+ f = u[l][i];
641
+ g = (f >= 0) ? -sqrt(s) : sqrt(s);
642
+ h = f * g - s;
643
+ u[l][i] = f - g;
644
+ for (k = l; k < m; k++) rv1[k] = u[k][i] / h;
645
+ for (j = l; j < n; j++)
646
+ { s = 0.0;
647
+ for (k = l; k < m; k++) s += u[k][j] * u[k][i];
648
+ for (k = l; k < m; k++) u[k][j] += s * rv1[k];
649
+ }
650
+ for (k = l; k < m; k++) u[k][i] *= scale;
651
+ }
652
+ }
653
+ anorm = max(anorm,fabs(w[i])+fabs(rv1[i]));
654
+ }
655
+ /* accumulation of right-hand transformations */
656
+ for (i = m-1; i>=0; i--)
657
+ { if (i < m-1)
658
+ { if (g != 0.0)
659
+ { for (j = l; j < m; j++) vt[j][i] = (u[j][i] / u[l][i]) / g;
660
+ /* double division avoids possible underflow */
661
+ for (j = l; j < m; j++)
662
+ { s = 0.0;
663
+ for (k = l; k < m; k++) s += u[k][i] * vt[k][j];
664
+ for (k = l; k < m; k++) vt[k][j] += s * vt[k][i];
665
+ }
666
+ }
667
+ }
668
+ for (j = l; j < m; j++)
669
+ { vt[i][j] = 0.0;
670
+ vt[j][i] = 0.0;
671
+ }
672
+ vt[i][i] = 1.0;
673
+ g = rv1[i];
674
+ l = i;
675
+ }
676
+ /* accumulation of left-hand transformations */
677
+ for (i = m-1; i >= 0; i--)
678
+ { l = i + 1;
679
+ g = w[i];
680
+ if (i!=m-1)
681
+ for (j = l; j < m; j++) u[j][i] = 0.0;
682
+ if (g!=0.0)
683
+ { if (i!=m-1)
684
+ { for (j = l; j < m; j++)
685
+ { s = 0.0;
686
+ for (k = l; k < n; k++) s += u[i][k] * u[j][k];
687
+ /* double division avoids possible underflow */
688
+ f = (s / u[i][i]) / g;
689
+ for (k = i; k < n; k++) u[j][k] += f * u[i][k];
690
+ }
691
+ }
692
+ for (j = i; j < n; j++) u[i][j] /= g;
693
+ }
694
+ else
695
+ for (j = i; j < n; j++) u[i][j] = 0.0;
696
+ u[i][i] += 1.0;
697
+ }
698
+ /* diagonalization of the bidiagonal form */
699
+ for (k = m-1; k >= 0; k--)
700
+ { k1 = k-1;
701
+ its = 0;
702
+ while(1)
703
+ /* test for splitting */
704
+ { for (l = k; l >= 0; l--)
705
+ { l1 = l-1;
706
+ if (fabs(rv1[l]) + anorm == anorm) break;
707
+ /* rv1[0] is always zero, so there is no exit
708
+ * through the bottom of the loop */
709
+ if (fabs(w[l1]) + anorm == anorm)
710
+ /* cancellation of rv1[l] if l greater than 0 */
711
+ { c = 0.0;
712
+ s = 1.0;
713
+ for (i = l; i <= k; i++)
714
+ { f = s * rv1[i];
715
+ rv1[i] *= c;
716
+ if (fabs(f) + anorm == anorm) break;
717
+ g = w[i];
718
+ h = sqrt(f*f+g*g);
719
+ w[i] = h;
720
+ c = g / h;
721
+ s = -f / h;
722
+ for (j = 0; j < n; j++)
723
+ { y = u[l1][j];
724
+ z = u[i][j];
725
+ u[l1][j] = y * c + z * s;
726
+ u[i][j] = -y * s + z * c;
727
+ }
728
+ }
729
+ break;
730
+ }
731
+ }
732
+ /* test for convergence */
733
+ z = w[k];
734
+ if (l==k) /* convergence */
735
+ { if (z < 0.0)
736
+ /* w[k] is made non-negative */
737
+ { w[k] = -z;
738
+ for (j = 0; j < m; j++) vt[j][k] = -vt[j][k];
739
+ }
740
+ break;
741
+ }
742
+ else if (its==30)
743
+ { ierr = k;
744
+ break;
745
+ }
746
+ else
747
+ /* shift from bottom 2 by 2 minor */
748
+ { its++;
749
+ x = w[l];
750
+ y = w[k1];
751
+ g = rv1[k1];
752
+ h = rv1[k];
753
+ f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
754
+ g = sqrt(f*f+1.0);
755
+ f = ((x - z) * (x + z) + h * (y / (f + (f >= 0 ? g : -g)) - h)) / x;
756
+ /* next qr transformation */
757
+ c = 1.0;
758
+ s = 1.0;
759
+ for (i1 = l; i1 <= k1; i1++)
760
+ { i = i1 + 1;
761
+ g = rv1[i];
762
+ y = w[i];
763
+ h = s * g;
764
+ g = c * g;
765
+ z = sqrt(f*f+h*h);
766
+ rv1[i1] = z;
767
+ c = f / z;
768
+ s = h / z;
769
+ f = x * c + g * s;
770
+ g = -x * s + g * c;
771
+ h = y * s;
772
+ y = y * c;
773
+ for (j = 0; j < m; j++)
774
+ { x = vt[j][i1];
775
+ z = vt[j][i];
776
+ vt[j][i1] = x * c + z * s;
777
+ vt[j][i] = -x * s + z * c;
778
+ }
779
+ z = sqrt(f*f+h*h);
780
+ w[i1] = z;
781
+ /* rotation can be arbitrary if z is zero */
782
+ if (z!=0.0)
783
+ { c = f / z;
784
+ s = h / z;
785
+ }
786
+ f = c * g + s * y;
787
+ x = -s * g + c * y;
788
+ for (j = 0; j < n; j++)
789
+ { y = u[i1][j];
790
+ z = u[i][j];
791
+ u[i1][j] = y * c + z * s;
792
+ u[i][j] = -y * s + z * c;
793
+ }
794
+ }
795
+ rv1[l] = 0.0;
796
+ rv1[k] = f;
797
+ w[k] = x;
798
+ }
799
+ }
800
+ }
801
+ }
802
+ free(rv1);
803
+ return ierr;
804
+ }
805
+
806
+ /* ********************************************************************* */
807
+
808
+ int pca(int nrows, int ncolumns, double** u, double** v, double* w)
809
+ /*
810
+ Purpose
811
+ =======
812
+
813
+ This subroutine uses the singular value decomposition to perform principal
814
+ components analysis of a real nrows by ncolumns rectangular matrix.
815
+
816
+ Arguments
817
+ =========
818
+
819
+ nrows (input) int
820
+ The number of rows in the matrix u.
821
+
822
+ ncolumns (input) int
823
+ The number of columns in the matrix v.
824
+
825
+ u (input) double[nrows][ncolumns]
826
+ On input, the array containing the data to which the principal component
827
+ analysis should be applied. The function assumes that the mean has already been
828
+ subtracted of each column, and hence that the mean of each column is zero.
829
+ On output, see below.
830
+
831
+ v (input) double[n][n], where n = min(nrows, ncolumns)
832
+ Not used on input.
833
+
834
+ w (input) double[n], where n = min(nrows, ncolumns)
835
+ Not used on input.
836
+
837
+
838
+ Return value
839
+ ============
840
+
841
+ On output:
842
+
843
+ If nrows >= ncolumns, then
844
+
845
+ u contains the coordinates with respect to the principal components;
846
+ v contains the principal component vectors.
847
+
848
+ The dot product u . v reproduces the data that were passed in u.
849
+
850
+
851
+ If nrows < ncolumns, then
852
+
853
+ u contains the principal component vectors;
854
+ v contains the coordinates with respect to the principal components.
855
+
856
+ The dot product v . u reproduces the data that were passed in u.
857
+
858
+ The eigenvalues of the covariance matrix are returned in w.
859
+
860
+ The arrays u, v, and w are sorted according to eigenvalue, with the largest
861
+ eigenvalues appearing first.
862
+
863
+ The function returns 0 if successful, -1 if memory allocation fails, and a
864
+ positive integer if the singular value decomposition fails to converge.
865
+ */
866
+ {
867
+ int i;
868
+ int j;
869
+ int error;
870
+ int* index = malloc(ncolumns*sizeof(int));
871
+ double* temp = malloc(ncolumns*sizeof(double));
872
+ if (!index || !temp)
873
+ { if (index) free(index);
874
+ if (temp) free(temp);
875
+ return -1;
876
+ }
877
+ error = svd(nrows, ncolumns, u, w, v);
878
+ if (error==0)
879
+ {
880
+ if (nrows >= ncolumns)
881
+ { for (j = 0; j < ncolumns; j++)
882
+ { const double s = w[j];
883
+ for (i = 0; i < nrows; i++) u[i][j] *= s;
884
+ }
885
+ sort(ncolumns, w, index);
886
+ for (i = 0; i < ncolumns/2; i++)
887
+ { j = index[i];
888
+ index[i] = index[ncolumns-1-i];
889
+ index[ncolumns-1-i] = j;
890
+ }
891
+ for (i = 0; i < nrows; i++)
892
+ { for (j = 0; j < ncolumns; j++) temp[j] = u[i][index[j]];
893
+ for (j = 0; j < ncolumns; j++) u[i][j] = temp[j];
894
+ }
895
+ for (i = 0; i < ncolumns; i++)
896
+ { for (j = 0; j < ncolumns; j++) temp[j] = v[index[j]][i];
897
+ for (j = 0; j < ncolumns; j++) v[j][i] = temp[j];
898
+ }
899
+ for (i = 0; i < ncolumns; i++) temp[i] = w[index[i]];
900
+ for (i = 0; i < ncolumns; i++) w[i] = temp[i];
901
+ }
902
+ else /* nrows < ncolumns */
903
+ { for (j = 0; j < nrows; j++)
904
+ { const double s = w[j];
905
+ for (i = 0; i < nrows; i++) v[i][j] *= s;
906
+ }
907
+ sort(nrows, w, index);
908
+ for (i = 0; i < nrows/2; i++)
909
+ { j = index[i];
910
+ index[i] = index[nrows-1-i];
911
+ index[nrows-1-i] = j;
912
+ }
913
+ for (j = 0; j < ncolumns; j++)
914
+ { for (i = 0; i < nrows; i++) temp[i] = u[index[i]][j];
915
+ for (i = 0; i < nrows; i++) u[i][j] = temp[i];
916
+ }
917
+ for (j = 0; j < nrows; j++)
918
+ { for (i = 0; i < nrows; i++) temp[i] = v[j][index[i]];
919
+ for (i = 0; i < nrows; i++) v[j][i] = temp[i];
920
+ }
921
+ for (i = 0; i < nrows; i++) temp[i] = w[index[i]];
922
+ for (i = 0; i < nrows; i++) w[i] = temp[i];
923
+ }
924
+ }
925
+ free(index);
926
+ free(temp);
927
+ return error;
928
+ }
929
+
930
+ /* ********************************************************************* */
931
+
932
+ static
933
+ double euclid (int n, double** data1, double** data2, int** mask1, int** mask2,
934
+ const double weight[], int index1, int index2, int transpose)
935
+
936
+ /*
937
+ Purpose
938
+ =======
939
+
940
+ The euclid routine calculates the weighted Euclidean distance between two
941
+ rows or columns in a matrix.
942
+
943
+ Arguments
944
+ =========
945
+
946
+ n (input) int
947
+ The number of elements in a row or column. If transpose==0, then n is the number
948
+ of columns; otherwise, n is the number of rows.
949
+
950
+ data1 (input) double array
951
+ The data array containing the first vector.
952
+
953
+ data2 (input) double array
954
+ The data array containing the second vector.
955
+
956
+ mask1 (input) int array
957
+ This array which elements in data1 are missing. If mask1[i][j]==0, then
958
+ data1[i][j] is missing.
959
+
960
+ mask2 (input) int array
961
+ This array which elements in data2 are missing. If mask2[i][j]==0, then
962
+ data2[i][j] is missing.
963
+
964
+ weight (input) double[n]
965
+ The weights that are used to calculate the distance.
966
+
967
+ index1 (input) int
968
+ Index of the first row or column.
969
+
970
+ index2 (input) int
971
+ Index of the second row or column.
972
+
973
+ transpose (input) int
974
+ If transpose==0, the distance between two rows in the matrix is calculated.
975
+ Otherwise, the distance between two columns in the matrix is calculated.
976
+
977
+ ============================================================================
978
+ */
979
+ { double result = 0.;
980
+ double tweight = 0;
981
+ int i;
982
+ if (transpose==0) /* Calculate the distance between two rows */
983
+ { for (i = 0; i < n; i++)
984
+ { if (mask1[index1][i] && mask2[index2][i])
985
+ { double term = data1[index1][i] - data2[index2][i];
986
+ result += weight[i]*term*term;
987
+ tweight += weight[i];
988
+ }
989
+ }
990
+ }
991
+ else
992
+ { for (i = 0; i < n; i++)
993
+ { if (mask1[i][index1] && mask2[i][index2])
994
+ { double term = data1[i][index1] - data2[i][index2];
995
+ result += weight[i]*term*term;
996
+ tweight += weight[i];
997
+ }
998
+ }
999
+ }
1000
+ if (!tweight) return 0; /* usually due to empty clusters */
1001
+ result /= tweight;
1002
+ return result;
1003
+ }
1004
+
1005
+ /* ********************************************************************* */
1006
+
1007
+ static
1008
+ double cityblock (int n, double** data1, double** data2, int** mask1,
1009
+ int** mask2, const double weight[], int index1, int index2, int transpose)
1010
+
1011
+ /*
1012
+ Purpose
1013
+ =======
1014
+
1015
+ The cityblock routine calculates the weighted "City Block" distance between
1016
+ two rows or columns in a matrix. City Block distance is defined as the
1017
+ absolute value of X1-X2 plus the absolute value of Y1-Y2 plus..., which is
1018
+ equivalent to taking an "up and over" path.
1019
+
1020
+ Arguments
1021
+ =========
1022
+
1023
+ n (input) int
1024
+ The number of elements in a row or column. If transpose==0, then n is the number
1025
+ of columns; otherwise, n is the number of rows.
1026
+
1027
+ data1 (input) double array
1028
+ The data array containing the first vector.
1029
+
1030
+ data2 (input) double array
1031
+ The data array containing the second vector.
1032
+
1033
+ mask1 (input) int array
1034
+ This array which elements in data1 are missing. If mask1[i][j]==0, then
1035
+ data1[i][j] is missing.
1036
+
1037
+ mask2 (input) int array
1038
+ This array which elements in data2 are missing. If mask2[i][j]==0, then
1039
+ data2[i][j] is missing.
1040
+
1041
+ weight (input) double[n]
1042
+ The weights that are used to calculate the distance.
1043
+
1044
+ index1 (input) int
1045
+ Index of the first row or column.
1046
+
1047
+ index2 (input) int
1048
+ Index of the second row or column.
1049
+
1050
+ transpose (input) int
1051
+ If transpose==0, the distance between two rows in the matrix is calculated.
1052
+ Otherwise, the distance between two columns in the matrix is calculated.
1053
+
1054
+ ============================================================================ */
1055
+ { double result = 0.;
1056
+ double tweight = 0;
1057
+ int i;
1058
+ if (transpose==0) /* Calculate the distance between two rows */
1059
+ { for (i = 0; i < n; i++)
1060
+ { if (mask1[index1][i] && mask2[index2][i])
1061
+ { double term = data1[index1][i] - data2[index2][i];
1062
+ result = result + weight[i]*fabs(term);
1063
+ tweight += weight[i];
1064
+ }
1065
+ }
1066
+ }
1067
+ else
1068
+ { for (i = 0; i < n; i++)
1069
+ { if (mask1[i][index1] && mask2[i][index2])
1070
+ { double term = data1[i][index1] - data2[i][index2];
1071
+ result = result + weight[i]*fabs(term);
1072
+ tweight += weight[i];
1073
+ }
1074
+ }
1075
+ }
1076
+ if (!tweight) return 0; /* usually due to empty clusters */
1077
+ result /= tweight;
1078
+ return result;
1079
+ }
1080
+
1081
+ /* ********************************************************************* */
1082
+
1083
+ static
1084
+ double correlation (int n, double** data1, double** data2, int** mask1,
1085
+ int** mask2, const double weight[], int index1, int index2, int transpose)
1086
+ /*
1087
+ Purpose
1088
+ =======
1089
+
1090
+ The correlation routine calculates the weighted Pearson distance between two
1091
+ rows or columns in a matrix. We define the Pearson distance as one minus the
1092
+ Pearson correlation.
1093
+ This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
1094
+ but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
1095
+ (e.g., choose b = a + c).
1096
+
1097
+ Arguments
1098
+ =========
1099
+
1100
+ n (input) int
1101
+ The number of elements in a row or column. If transpose==0, then n is the number
1102
+ of columns; otherwise, n is the number of rows.
1103
+
1104
+ data1 (input) double array
1105
+ The data array containing the first vector.
1106
+
1107
+ data2 (input) double array
1108
+ The data array containing the second vector.
1109
+
1110
+ mask1 (input) int array
1111
+ This array which elements in data1 are missing. If mask1[i][j]==0, then
1112
+ data1[i][j] is missing.
1113
+
1114
+ mask2 (input) int array
1115
+ This array which elements in data2 are missing. If mask2[i][j]==0, then
1116
+ data2[i][j] is missing.
1117
+
1118
+ weight (input) double[n]
1119
+ The weights that are used to calculate the distance.
1120
+
1121
+ index1 (input) int
1122
+ Index of the first row or column.
1123
+
1124
+ index2 (input) int
1125
+ Index of the second row or column.
1126
+
1127
+ transpose (input) int
1128
+ If transpose==0, the distance between two rows in the matrix is calculated.
1129
+ Otherwise, the distance between two columns in the matrix is calculated.
1130
+ ============================================================================
1131
+ */
1132
+ { double result = 0.;
1133
+ double sum1 = 0.;
1134
+ double sum2 = 0.;
1135
+ double denom1 = 0.;
1136
+ double denom2 = 0.;
1137
+ double tweight = 0.;
1138
+ if (transpose==0) /* Calculate the distance between two rows */
1139
+ { int i;
1140
+ for (i = 0; i < n; i++)
1141
+ { if (mask1[index1][i] && mask2[index2][i])
1142
+ { double term1 = data1[index1][i];
1143
+ double term2 = data2[index2][i];
1144
+ double w = weight[i];
1145
+ sum1 += w*term1;
1146
+ sum2 += w*term2;
1147
+ result += w*term1*term2;
1148
+ denom1 += w*term1*term1;
1149
+ denom2 += w*term2*term2;
1150
+ tweight += w;
1151
+ }
1152
+ }
1153
+ }
1154
+ else
1155
+ { int i;
1156
+ for (i = 0; i < n; i++)
1157
+ { if (mask1[i][index1] && mask2[i][index2])
1158
+ { double term1 = data1[i][index1];
1159
+ double term2 = data2[i][index2];
1160
+ double w = weight[i];
1161
+ sum1 += w*term1;
1162
+ sum2 += w*term2;
1163
+ result += w*term1*term2;
1164
+ denom1 += w*term1*term1;
1165
+ denom2 += w*term2*term2;
1166
+ tweight += w;
1167
+ }
1168
+ }
1169
+ }
1170
+ if (!tweight) return 0; /* usually due to empty clusters */
1171
+ result -= sum1 * sum2 / tweight;
1172
+ denom1 -= sum1 * sum1 / tweight;
1173
+ denom2 -= sum2 * sum2 / tweight;
1174
+ if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
1175
+ if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
1176
+ result = result / sqrt(denom1*denom2);
1177
+ result = 1. - result;
1178
+ return result;
1179
+ }
1180
+
1181
+ /* ********************************************************************* */
1182
+
1183
+ static
1184
+ double acorrelation (int n, double** data1, double** data2, int** mask1,
1185
+ int** mask2, const double weight[], int index1, int index2, int transpose)
1186
+ /*
1187
+ Purpose
1188
+ =======
1189
+
1190
+ The acorrelation routine calculates the weighted Pearson distance between two
1191
+ rows or columns, using the absolute value of the correlation.
1192
+ This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
1193
+ but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
1194
+ (e.g., choose b = a + c).
1195
+
1196
+ Arguments
1197
+ =========
1198
+
1199
+ n (input) int
1200
+ The number of elements in a row or column. If transpose==0, then n is the number
1201
+ of columns; otherwise, n is the number of rows.
1202
+
1203
+ data1 (input) double array
1204
+ The data array containing the first vector.
1205
+
1206
+ data2 (input) double array
1207
+ The data array containing the second vector.
1208
+
1209
+ mask1 (input) int array
1210
+ This array which elements in data1 are missing. If mask1[i][j]==0, then
1211
+ data1[i][j] is missing.
1212
+
1213
+ mask2 (input) int array
1214
+ This array which elements in data2 are missing. If mask2[i][j]==0, then
1215
+ data2[i][j] is missing.
1216
+
1217
+ weight (input) double[n]
1218
+ The weights that are used to calculate the distance.
1219
+
1220
+ index1 (input) int
1221
+ Index of the first row or column.
1222
+
1223
+ index2 (input) int
1224
+ Index of the second row or column.
1225
+
1226
+ transpose (input) int
1227
+ If transpose==0, the distance between two rows in the matrix is calculated.
1228
+ Otherwise, the distance between two columns in the matrix is calculated.
1229
+ ============================================================================
1230
+ */
1231
+ { double result = 0.;
1232
+ double sum1 = 0.;
1233
+ double sum2 = 0.;
1234
+ double denom1 = 0.;
1235
+ double denom2 = 0.;
1236
+ double tweight = 0.;
1237
+ if (transpose==0) /* Calculate the distance between two rows */
1238
+ { int i;
1239
+ for (i = 0; i < n; i++)
1240
+ { if (mask1[index1][i] && mask2[index2][i])
1241
+ { double term1 = data1[index1][i];
1242
+ double term2 = data2[index2][i];
1243
+ double w = weight[i];
1244
+ sum1 += w*term1;
1245
+ sum2 += w*term2;
1246
+ result += w*term1*term2;
1247
+ denom1 += w*term1*term1;
1248
+ denom2 += w*term2*term2;
1249
+ tweight += w;
1250
+ }
1251
+ }
1252
+ }
1253
+ else
1254
+ { int i;
1255
+ for (i = 0; i < n; i++)
1256
+ { if (mask1[i][index1] && mask2[i][index2])
1257
+ { double term1 = data1[i][index1];
1258
+ double term2 = data2[i][index2];
1259
+ double w = weight[i];
1260
+ sum1 += w*term1;
1261
+ sum2 += w*term2;
1262
+ result += w*term1*term2;
1263
+ denom1 += w*term1*term1;
1264
+ denom2 += w*term2*term2;
1265
+ tweight += w;
1266
+ }
1267
+ }
1268
+ }
1269
+ if (!tweight) return 0; /* usually due to empty clusters */
1270
+ result -= sum1 * sum2 / tweight;
1271
+ denom1 -= sum1 * sum1 / tweight;
1272
+ denom2 -= sum2 * sum2 / tweight;
1273
+ if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
1274
+ if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
1275
+ result = fabs(result) / sqrt(denom1*denom2);
1276
+ result = 1. - result;
1277
+ return result;
1278
+ }
1279
+
1280
+ /* ********************************************************************* */
1281
+
1282
+ static
1283
+ double ucorrelation (int n, double** data1, double** data2, int** mask1,
1284
+ int** mask2, const double weight[], int index1, int index2, int transpose)
1285
+ /*
1286
+ Purpose
1287
+ =======
1288
+
1289
+ The ucorrelation routine calculates the weighted Pearson distance between two
1290
+ rows or columns, using the uncentered version of the Pearson correlation. In the
1291
+ uncentered Pearson correlation, a zero mean is used for both vectors even if
1292
+ the actual mean is nonzero.
1293
+ This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
1294
+ but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
1295
+ (e.g., choose b = a + c).
1296
+
1297
+ Arguments
1298
+ =========
1299
+
1300
+ n (input) int
1301
+ The number of elements in a row or column. If transpose==0, then n is the number
1302
+ of columns; otherwise, n is the number of rows.
1303
+
1304
+ data1 (input) double array
1305
+ The data array containing the first vector.
1306
+
1307
+ data2 (input) double array
1308
+ The data array containing the second vector.
1309
+
1310
+ mask1 (input) int array
1311
+ This array which elements in data1 are missing. If mask1[i][j]==0, then
1312
+ data1[i][j] is missing.
1313
+
1314
+ mask2 (input) int array
1315
+ This array which elements in data2 are missing. If mask2[i][j]==0, then
1316
+ data2[i][j] is missing.
1317
+
1318
+ weight (input) double[n]
1319
+ The weights that are used to calculate the distance.
1320
+
1321
+ index1 (input) int
1322
+ Index of the first row or column.
1323
+
1324
+ index2 (input) int
1325
+ Index of the second row or column.
1326
+
1327
+ transpose (input) int
1328
+ If transpose==0, the distance between two rows in the matrix is calculated.
1329
+ Otherwise, the distance between two columns in the matrix is calculated.
1330
+ ============================================================================
1331
+ */
1332
+ { double result = 0.;
1333
+ double denom1 = 0.;
1334
+ double denom2 = 0.;
1335
+ int flag = 0;
1336
+ /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
1337
+ * found.
1338
+ */
1339
+ if (transpose==0) /* Calculate the distance between two rows */
1340
+ { int i;
1341
+ for (i = 0; i < n; i++)
1342
+ { if (mask1[index1][i] && mask2[index2][i])
1343
+ { double term1 = data1[index1][i];
1344
+ double term2 = data2[index2][i];
1345
+ double w = weight[i];
1346
+ result += w*term1*term2;
1347
+ denom1 += w*term1*term1;
1348
+ denom2 += w*term2*term2;
1349
+ flag = 1;
1350
+ }
1351
+ }
1352
+ }
1353
+ else
1354
+ { int i;
1355
+ for (i = 0; i < n; i++)
1356
+ { if (mask1[i][index1] && mask2[i][index2])
1357
+ { double term1 = data1[i][index1];
1358
+ double term2 = data2[i][index2];
1359
+ double w = weight[i];
1360
+ result += w*term1*term2;
1361
+ denom1 += w*term1*term1;
1362
+ denom2 += w*term2*term2;
1363
+ flag = 1;
1364
+ }
1365
+ }
1366
+ }
1367
+ if (!flag) return 0.;
1368
+ if (denom1==0.) return 1.;
1369
+ if (denom2==0.) return 1.;
1370
+ result = result / sqrt(denom1*denom2);
1371
+ result = 1. - result;
1372
+ return result;
1373
+ }
1374
+
1375
+ /* ********************************************************************* */
1376
+
1377
+ static
1378
+ double uacorrelation (int n, double** data1, double** data2, int** mask1,
1379
+ int** mask2, const double weight[], int index1, int index2, int transpose)
1380
+ /*
1381
+ Purpose
1382
+ =======
1383
+
1384
+ The uacorrelation routine calculates the weighted Pearson distance between two
1385
+ rows or columns, using the absolute value of the uncentered version of the
1386
+ Pearson correlation. In the uncentered Pearson correlation, a zero mean is used
1387
+ for both vectors even if the actual mean is nonzero.
1388
+ This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
1389
+ but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
1390
+ (e.g., choose b = a + c).
1391
+
1392
+ Arguments
1393
+ =========
1394
+
1395
+ n (input) int
1396
+ The number of elements in a row or column. If transpose==0, then n is the number
1397
+ of columns; otherwise, n is the number of rows.
1398
+
1399
+ data1 (input) double array
1400
+ The data array containing the first vector.
1401
+
1402
+ data2 (input) double array
1403
+ The data array containing the second vector.
1404
+
1405
+ mask1 (input) int array
1406
+ This array which elements in data1 are missing. If mask1[i][j]==0, then
1407
+ data1[i][j] is missing.
1408
+
1409
+ mask2 (input) int array
1410
+ This array which elements in data2 are missing. If mask2[i][j]==0, then
1411
+ data2[i][j] is missing.
1412
+
1413
+ weight (input) double[n]
1414
+ The weights that are used to calculate the distance.
1415
+
1416
+ index1 (input) int
1417
+ Index of the first row or column.
1418
+
1419
+ index2 (input) int
1420
+ Index of the second row or column.
1421
+
1422
+ transpose (input) int
1423
+ If transpose==0, the distance between two rows in the matrix is calculated.
1424
+ Otherwise, the distance between two columns in the matrix is calculated.
1425
+ ============================================================================
1426
+ */
1427
+ { double result = 0.;
1428
+ double denom1 = 0.;
1429
+ double denom2 = 0.;
1430
+ int flag = 0;
1431
+ /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
1432
+ * found.
1433
+ */
1434
+ if (transpose==0) /* Calculate the distance between two rows */
1435
+ { int i;
1436
+ for (i = 0; i < n; i++)
1437
+ { if (mask1[index1][i] && mask2[index2][i])
1438
+ { double term1 = data1[index1][i];
1439
+ double term2 = data2[index2][i];
1440
+ double w = weight[i];
1441
+ result += w*term1*term2;
1442
+ denom1 += w*term1*term1;
1443
+ denom2 += w*term2*term2;
1444
+ flag = 1;
1445
+ }
1446
+ }
1447
+ }
1448
+ else
1449
+ { int i;
1450
+ for (i = 0; i < n; i++)
1451
+ { if (mask1[i][index1] && mask2[i][index2])
1452
+ { double term1 = data1[i][index1];
1453
+ double term2 = data2[i][index2];
1454
+ double w = weight[i];
1455
+ result += w*term1*term2;
1456
+ denom1 += w*term1*term1;
1457
+ denom2 += w*term2*term2;
1458
+ flag = 1;
1459
+ }
1460
+ }
1461
+ }
1462
+ if (!flag) return 0.;
1463
+ if (denom1==0.) return 1.;
1464
+ if (denom2==0.) return 1.;
1465
+ result = fabs(result) / sqrt(denom1*denom2);
1466
+ result = 1. - result;
1467
+ return result;
1468
+ }
1469
+
1470
+ /* ********************************************************************* */
1471
+
1472
+ static
1473
+ double spearman (int n, double** data1, double** data2, int** mask1,
1474
+ int** mask2, const double weight[], int index1, int index2, int transpose)
1475
+ /*
1476
+ Purpose
1477
+ =======
1478
+
1479
+ The spearman routine calculates the Spearman distance between two rows or
1480
+ columns. The Spearman distance is defined as one minus the Spearman rank
1481
+ correlation.
1482
+
1483
+ Arguments
1484
+ =========
1485
+
1486
+ n (input) int
1487
+ The number of elements in a row or column. If transpose==0, then n is the number
1488
+ of columns; otherwise, n is the number of rows.
1489
+
1490
+ data1 (input) double array
1491
+ The data array containing the first vector.
1492
+
1493
+ data2 (input) double array
1494
+ The data array containing the second vector.
1495
+
1496
+ mask1 (input) int array
1497
+ This array which elements in data1 are missing. If mask1[i][j]==0, then
1498
+ data1[i][j] is missing.
1499
+
1500
+ mask2 (input) int array
1501
+ This array which elements in data2 are missing. If mask2[i][j]==0, then
1502
+ data2[i][j] is missing.
1503
+
1504
+ weight (input) double[n]
1505
+ These weights are ignored, but included for consistency with other distance
1506
+ measures.
1507
+
1508
+ index1 (input) int
1509
+ Index of the first row or column.
1510
+
1511
+ index2 (input) int
1512
+ Index of the second row or column.
1513
+
1514
+ transpose (input) int
1515
+ If transpose==0, the distance between two rows in the matrix is calculated.
1516
+ Otherwise, the distance between two columns in the matrix is calculated.
1517
+ ============================================================================
1518
+ */
1519
+ { int i;
1520
+ int m = 0;
1521
+ double* rank1;
1522
+ double* rank2;
1523
+ double result = 0.;
1524
+ double denom1 = 0.;
1525
+ double denom2 = 0.;
1526
+ double avgrank;
1527
+ double* tdata1;
1528
+ double* tdata2;
1529
+ tdata1 = malloc(n*sizeof(double));
1530
+ if(!tdata1) return 0.0; /* Memory allocation error */
1531
+ tdata2 = malloc(n*sizeof(double));
1532
+ if(!tdata2) /* Memory allocation error */
1533
+ { free(tdata1);
1534
+ return 0.0;
1535
+ }
1536
+ if (transpose==0)
1537
+ { for (i = 0; i < n; i++)
1538
+ { if (mask1[index1][i] && mask2[index2][i])
1539
+ { tdata1[m] = data1[index1][i];
1540
+ tdata2[m] = data2[index2][i];
1541
+ m++;
1542
+ }
1543
+ }
1544
+ }
1545
+ else
1546
+ { for (i = 0; i < n; i++)
1547
+ { if (mask1[i][index1] && mask2[i][index2])
1548
+ { tdata1[m] = data1[i][index1];
1549
+ tdata2[m] = data2[i][index2];
1550
+ m++;
1551
+ }
1552
+ }
1553
+ }
1554
+ if (m==0)
1555
+ { free(tdata1);
1556
+ free(tdata2);
1557
+ return 0;
1558
+ }
1559
+ rank1 = getrank(m, tdata1);
1560
+ free(tdata1);
1561
+ if(!rank1)
1562
+ { free(tdata2);
1563
+ return 0.0; /* Memory allocation error */
1564
+ }
1565
+ rank2 = getrank(m, tdata2);
1566
+ free(tdata2);
1567
+ if(!rank2) /* Memory allocation error */
1568
+ { free(rank1);
1569
+ return 0.0;
1570
+ }
1571
+ avgrank = 0.5*(m-1); /* Average rank */
1572
+ for (i = 0; i < m; i++)
1573
+ { const double value1 = rank1[i];
1574
+ const double value2 = rank2[i];
1575
+ result += value1 * value2;
1576
+ denom1 += value1 * value1;
1577
+ denom2 += value2 * value2;
1578
+ }
1579
+ /* Note: denom1 and denom2 cannot be calculated directly from the number
1580
+ * of elements. If two elements have the same rank, the squared sum of
1581
+ * their ranks will change.
1582
+ */
1583
+ free(rank1);
1584
+ free(rank2);
1585
+ result /= m;
1586
+ denom1 /= m;
1587
+ denom2 /= m;
1588
+ result -= avgrank * avgrank;
1589
+ denom1 -= avgrank * avgrank;
1590
+ denom2 -= avgrank * avgrank;
1591
+ if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
1592
+ if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
1593
+ result = result / sqrt(denom1*denom2);
1594
+ result = 1. - result;
1595
+ return result;
1596
+ }
1597
+
1598
+ /* ********************************************************************* */
1599
+
1600
+ static
1601
+ double kendall (int n, double** data1, double** data2, int** mask1, int** mask2,
1602
+ const double weight[], int index1, int index2, int transpose)
1603
+ /*
1604
+ Purpose
1605
+ =======
1606
+
1607
+ The kendall routine calculates the Kendall distance between two
1608
+ rows or columns. The Kendall distance is defined as one minus Kendall's tau.
1609
+
1610
+ Arguments
1611
+ =========
1612
+
1613
+ n (input) int
1614
+ The number of elements in a row or column. If transpose==0, then n is the number
1615
+ of columns; otherwise, n is the number of rows.
1616
+
1617
+ data1 (input) double array
1618
+ The data array containing the first vector.
1619
+
1620
+ data2 (input) double array
1621
+ The data array containing the second vector.
1622
+
1623
+ mask1 (input) int array
1624
+ This array which elements in data1 are missing. If mask1[i][j]==0, then
1625
+ data1[i][j] is missing.
1626
+
1627
+ mask2 (input) int array
1628
+ This array which elements in data2 are missing. If mask2[i][j]==0, then
1629
+ data2[i][j] is missing.
1630
+
1631
+ weight (input) double[n]
1632
+ These weights are ignored, but included for consistency with other distance
1633
+ measures.
1634
+
1635
+ index1 (input) int
1636
+ Index of the first row or column.
1637
+
1638
+ index2 (input) int
1639
+ Index of the second row or column.
1640
+
1641
+ transpose (input) int
1642
+ If transpose==0, the distance between two rows in the matrix is calculated.
1643
+ Otherwise, the distance between two columns in the matrix is calculated.
1644
+ ============================================================================
1645
+ */
1646
+ { int con = 0;
1647
+ int dis = 0;
1648
+ int exx = 0;
1649
+ int exy = 0;
1650
+ int flag = 0;
1651
+ /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
1652
+ * found.
1653
+ */
1654
+ double denomx;
1655
+ double denomy;
1656
+ double tau;
1657
+ int i, j;
1658
+ if (transpose==0)
1659
+ { for (i = 0; i < n; i++)
1660
+ { if (mask1[index1][i] && mask2[index2][i])
1661
+ { for (j = 0; j < i; j++)
1662
+ { if (mask1[index1][j] && mask2[index2][j])
1663
+ { double x1 = data1[index1][i];
1664
+ double x2 = data1[index1][j];
1665
+ double y1 = data2[index2][i];
1666
+ double y2 = data2[index2][j];
1667
+ if (x1 < x2 && y1 < y2) con++;
1668
+ if (x1 > x2 && y1 > y2) con++;
1669
+ if (x1 < x2 && y1 > y2) dis++;
1670
+ if (x1 > x2 && y1 < y2) dis++;
1671
+ if (x1 == x2 && y1 != y2) exx++;
1672
+ if (x1 != x2 && y1 == y2) exy++;
1673
+ flag = 1;
1674
+ }
1675
+ }
1676
+ }
1677
+ }
1678
+ }
1679
+ else
1680
+ { for (i = 0; i < n; i++)
1681
+ { if (mask1[i][index1] && mask2[i][index2])
1682
+ { for (j = 0; j < i; j++)
1683
+ { if (mask1[j][index1] && mask2[j][index2])
1684
+ { double x1 = data1[i][index1];
1685
+ double x2 = data1[j][index1];
1686
+ double y1 = data2[i][index2];
1687
+ double y2 = data2[j][index2];
1688
+ if (x1 < x2 && y1 < y2) con++;
1689
+ if (x1 > x2 && y1 > y2) con++;
1690
+ if (x1 < x2 && y1 > y2) dis++;
1691
+ if (x1 > x2 && y1 < y2) dis++;
1692
+ if (x1 == x2 && y1 != y2) exx++;
1693
+ if (x1 != x2 && y1 == y2) exy++;
1694
+ flag = 1;
1695
+ }
1696
+ }
1697
+ }
1698
+ }
1699
+ }
1700
+ if (!flag) return 0.;
1701
+ denomx = con + dis + exx;
1702
+ denomy = con + dis + exy;
1703
+ if (denomx==0) return 1;
1704
+ if (denomy==0) return 1;
1705
+ tau = (con-dis)/sqrt(denomx*denomy);
1706
+ return 1.-tau;
1707
+ }
1708
+
1709
+ /* ********************************************************************* */
1710
+
1711
+ static double(*setmetric(char dist))
1712
+ (int, double**, double**, int**, int**, const double[], int, int, int)
1713
+ { switch(dist)
1714
+ { case 'e': return &euclid;
1715
+ case 'b': return &cityblock;
1716
+ case 'c': return &correlation;
1717
+ case 'a': return &acorrelation;
1718
+ case 'u': return &ucorrelation;
1719
+ case 'x': return &uacorrelation;
1720
+ case 's': return &spearman;
1721
+ case 'k': return &kendall;
1722
+ default: return &euclid;
1723
+ }
1724
+ return NULL; /* Never get here */
1725
+ }
1726
+
1727
+ /* ********************************************************************* */
1728
+
1729
+ static double uniform(void)
1730
+ /*
1731
+ Purpose
1732
+ =======
1733
+
1734
+ This routine returns a uniform random number between 0.0 and 1.0. Both 0.0
1735
+ and 1.0 are excluded. This random number generator is described in:
1736
+
1737
+ Pierre l'Ecuyer
1738
+ Efficient and Portable Combined Random Number Generators
1739
+ Communications of the ACM, Volume 31, Number 6, June 1988, pages 742-749,774.
1740
+
1741
+ The first time this routine is called, it initializes the random number
1742
+ generator using the current time. First, the current epoch time in seconds is
1743
+ used as a seed for the random number generator in the C library. The first two
1744
+ random numbers generated by this generator are used to initialize the random
1745
+ number generator implemented in this routine.
1746
+
1747
+
1748
+ Arguments
1749
+ =========
1750
+
1751
+ None.
1752
+
1753
+
1754
+ Return value
1755
+ ============
1756
+
1757
+ A double-precison number between 0.0 and 1.0.
1758
+ ============================================================================
1759
+ */
1760
+ { int z;
1761
+ static const int m1 = 2147483563;
1762
+ static const int m2 = 2147483399;
1763
+ const double scale = 1.0/m1;
1764
+
1765
+ static int s1 = 0;
1766
+ static int s2 = 0;
1767
+
1768
+ if (s1==0 || s2==0) /* initialize */
1769
+ { unsigned int initseed = (unsigned int) time(0);
1770
+ srand(initseed);
1771
+ s1 = rand();
1772
+ s2 = rand();
1773
+ }
1774
+
1775
+ do
1776
+ { int k;
1777
+ k = s1/53668;
1778
+ s1 = 40014*(s1-k*53668)-k*12211;
1779
+ if (s1 < 0) s1+=m1;
1780
+ k = s2/52774;
1781
+ s2 = 40692*(s2-k*52774)-k*3791;
1782
+ if(s2 < 0) s2+=m2;
1783
+ z = s1-s2;
1784
+ if(z < 1) z+=(m1-1);
1785
+ } while (z==m1); /* To avoid returning 1.0 */
1786
+
1787
+ return z*scale;
1788
+ }
1789
+
1790
+ /* ************************************************************************ */
1791
+
1792
+ static int binomial(int n, double p)
1793
+ /*
1794
+ Purpose
1795
+ =======
1796
+
1797
+ This routine generates a random number between 0 and n inclusive, following
1798
+ the binomial distribution with probability p and n trials. The routine is
1799
+ based on the BTPE algorithm, described in:
1800
+
1801
+ Voratas Kachitvichyanukul and Bruce W. Schmeiser:
1802
+ Binomial Random Variate Generation
1803
+ Communications of the ACM, Volume 31, Number 2, February 1988, pages 216-222.
1804
+
1805
+
1806
+ Arguments
1807
+ =========
1808
+
1809
+ p (input) double
1810
+ The probability of a single event. This probability should be less than or
1811
+ equal to 0.5.
1812
+
1813
+ n (input) int
1814
+ The number of trials.
1815
+
1816
+
1817
+ Return value
1818
+ ============
1819
+
1820
+ An integer drawn from a binomial distribution with parameters (p, n).
1821
+
1822
+ ============================================================================
1823
+ */
1824
+ { const double q = 1 - p;
1825
+ if (n*p < 30.0) /* Algorithm BINV */
1826
+ { const double s = p/q;
1827
+ const double a = (n+1)*s;
1828
+ double r = exp(n*log(q)); /* pow() causes a crash on AIX */
1829
+ int x = 0;
1830
+ double u = uniform();
1831
+ while(1)
1832
+ { if (u < r) return x;
1833
+ u-=r;
1834
+ x++;
1835
+ r *= (a/x)-s;
1836
+ }
1837
+ }
1838
+ else /* Algorithm BTPE */
1839
+ { /* Step 0 */
1840
+ const double fm = n*p + p;
1841
+ const int m = (int) fm;
1842
+ const double p1 = floor(2.195*sqrt(n*p*q) -4.6*q) + 0.5;
1843
+ const double xm = m + 0.5;
1844
+ const double xl = xm - p1;
1845
+ const double xr = xm + p1;
1846
+ const double c = 0.134 + 20.5/(15.3+m);
1847
+ const double a = (fm-xl)/(fm-xl*p);
1848
+ const double b = (xr-fm)/(xr*q);
1849
+ const double lambdal = a*(1.0+0.5*a);
1850
+ const double lambdar = b*(1.0+0.5*b);
1851
+ const double p2 = p1*(1+2*c);
1852
+ const double p3 = p2 + c/lambdal;
1853
+ const double p4 = p3 + c/lambdar;
1854
+ while (1)
1855
+ { /* Step 1 */
1856
+ int y;
1857
+ int k;
1858
+ double u = uniform();
1859
+ double v = uniform();
1860
+ u *= p4;
1861
+ if (u <= p1) return (int)(xm-p1*v+u);
1862
+ /* Step 2 */
1863
+ if (u > p2)
1864
+ { /* Step 3 */
1865
+ if (u > p3)
1866
+ { /* Step 4 */
1867
+ y = (int)(xr-log(v)/lambdar);
1868
+ if (y > n) continue;
1869
+ /* Go to step 5 */
1870
+ v = v*(u-p3)*lambdar;
1871
+ }
1872
+ else
1873
+ { y = (int)(xl+log(v)/lambdal);
1874
+ if (y < 0) continue;
1875
+ /* Go to step 5 */
1876
+ v = v*(u-p2)*lambdal;
1877
+ }
1878
+ }
1879
+ else
1880
+ { const double x = xl + (u-p1)/c;
1881
+ v = v*c + 1.0 - fabs(m-x+0.5)/p1;
1882
+ if (v > 1) continue;
1883
+ /* Go to step 5 */
1884
+ y = (int)x;
1885
+ }
1886
+ /* Step 5 */
1887
+ /* Step 5.0 */
1888
+ k = abs(y-m);
1889
+ if (k > 20 && k < 0.5*n*p*q-1.0)
1890
+ { /* Step 5.2 */
1891
+ double rho = (k/(n*p*q))*((k*(k/3.0 + 0.625) + 0.1666666666666)/(n*p*q)+0.5);
1892
+ double t = -k*k/(2*n*p*q);
1893
+ double A = log(v);
1894
+ if (A < t-rho) return y;
1895
+ else if (A > t+rho) continue;
1896
+ else
1897
+ { /* Step 5.3 */
1898
+ double x1 = y+1;
1899
+ double f1 = m+1;
1900
+ double z = n+1-m;
1901
+ double w = n-y+1;
1902
+ double x2 = x1*x1;
1903
+ double f2 = f1*f1;
1904
+ double z2 = z*z;
1905
+ double w2 = w*w;
1906
+ if (A > xm * log(f1/x1) + (n-m+0.5)*log(z/w)
1907
+ + (y-m)*log(w*p/(x1*q))
1908
+ + (13860.-(462.-(132.-(99.-140./f2)/f2)/f2)/f2)/f1/166320.
1909
+ + (13860.-(462.-(132.-(99.-140./z2)/z2)/z2)/z2)/z/166320.
1910
+ + (13860.-(462.-(132.-(99.-140./x2)/x2)/x2)/x2)/x1/166320.
1911
+ + (13860.-(462.-(132.-(99.-140./w2)/w2)/w2)/w2)/w/166320.)
1912
+ continue;
1913
+ return y;
1914
+ }
1915
+ }
1916
+ else
1917
+ { /* Step 5.1 */
1918
+ int i;
1919
+ const double s = p/q;
1920
+ const double aa = s*(n+1);
1921
+ double f = 1.0;
1922
+ for (i = m; i < y; f *= (aa/(++i)-s));
1923
+ for (i = y; i < m; f /= (aa/(++i)-s));
1924
+ if (v > f) continue;
1925
+ return y;
1926
+ }
1927
+ }
1928
+ }
1929
+ /* Never get here */
1930
+ return -1;
1931
+ }
1932
+
1933
+ /* ************************************************************************ */
1934
+
1935
+ static void randomassign (int nclusters, int nelements, int clusterid[])
1936
+ /*
1937
+ Purpose
1938
+ =======
1939
+
1940
+ The randomassign routine performs an initial random clustering, needed for
1941
+ k-means or k-median clustering. Elements (genes or microarrays) are randomly
1942
+ assigned to clusters. The number of elements in each cluster is chosen
1943
+ randomly, making sure that each cluster will receive at least one element.
1944
+
1945
+
1946
+ Arguments
1947
+ =========
1948
+
1949
+ nclusters (input) int
1950
+ The number of clusters.
1951
+
1952
+ nelements (input) int
1953
+ The number of elements to be clustered (i.e., the number of genes or microarrays
1954
+ to be clustered).
1955
+
1956
+ clusterid (output) int[nelements]
1957
+ The cluster number to which an element was assigned.
1958
+
1959
+ ============================================================================
1960
+ */
1961
+ { int i, j;
1962
+ int k = 0;
1963
+ double p;
1964
+ int n = nelements-nclusters;
1965
+ /* Draw the number of elements in each cluster from a multinomial
1966
+ * distribution, reserving ncluster elements to set independently
1967
+ * in order to guarantee that none of the clusters are empty.
1968
+ */
1969
+ for (i = 0; i < nclusters-1; i++)
1970
+ { p = 1.0/(nclusters-i);
1971
+ j = binomial(n, p);
1972
+ n -= j;
1973
+ j += k+1; /* Assign at least one element to cluster i */
1974
+ for ( ; k < j; k++) clusterid[k] = i;
1975
+ }
1976
+ /* Assign the remaining elements to the last cluster */
1977
+ for ( ; k < nelements; k++) clusterid[k] = i;
1978
+
1979
+ /* Create a random permutation of the cluster assignments */
1980
+ for (i = 0; i < nelements; i++)
1981
+ { j = (int) (i + (nelements-i)*uniform());
1982
+ k = clusterid[j];
1983
+ clusterid[j] = clusterid[i];
1984
+ clusterid[i] = k;
1985
+ }
1986
+
1987
+ return;
1988
+ }
1989
+
1990
+ /* ********************************************************************* */
1991
+
1992
+ static void getclustermeans(int nclusters, int nrows, int ncolumns,
1993
+ double** data, int** mask, int clusterid[], double** cdata, int** cmask,
1994
+ int transpose)
1995
+ /*
1996
+ Purpose
1997
+ =======
1998
+
1999
+ The getclustermeans routine calculates the cluster centroids, given to which
2000
+ cluster each element belongs. The centroid is defined as the mean over all
2001
+ elements for each dimension.
2002
+
2003
+ Arguments
2004
+ =========
2005
+
2006
+ nclusters (input) int
2007
+ The number of clusters.
2008
+
2009
+ nrows (input) int
2010
+ The number of rows in the gene expression data matrix, equal to the number of
2011
+ genes.
2012
+
2013
+ ncolumns (input) int
2014
+ The number of columns in the gene expression data matrix, equal to the number of
2015
+ microarrays.
2016
+
2017
+ data (input) double[nrows][ncolumns]
2018
+ The array containing the gene expression data.
2019
+
2020
+ mask (input) int[nrows][ncolumns]
2021
+ This array shows which data values are missing. If mask[i][j]==0, then
2022
+ data[i][j] is missing.
2023
+
2024
+ clusterid (output) int[nrows] if transpose==0
2025
+ int[ncolumns] if transpose==1
2026
+ The cluster number to which each element belongs. If transpose==0, then the
2027
+ dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
2028
+ is equal to ncolumns (the number of microarrays).
2029
+
2030
+ cdata (output) double[nclusters][ncolumns] if transpose==0
2031
+ double[nrows][nclusters] if transpose==1
2032
+ On exit of getclustermeans, this array contains the cluster centroids.
2033
+
2034
+ cmask (output) int[nclusters][ncolumns] if transpose==0
2035
+ int[nrows][nclusters] if transpose==1
2036
+ This array shows which data values of are missing for each centroid. If
2037
+ cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
2038
+ a centroid if all corresponding data values of the cluster members are missing.
2039
+
2040
+ transpose (input) int
2041
+ If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
2042
+ columns (microarrays) are specified.
2043
+
2044
+ ========================================================================
2045
+ */
2046
+ { int i, j, k;
2047
+ if (transpose==0)
2048
+ { for (i = 0; i < nclusters; i++)
2049
+ { for (j = 0; j < ncolumns; j++)
2050
+ { cmask[i][j] = 0;
2051
+ cdata[i][j] = 0.;
2052
+ }
2053
+ }
2054
+ for (k = 0; k < nrows; k++)
2055
+ { i = clusterid[k];
2056
+ for (j = 0; j < ncolumns; j++)
2057
+ { if (mask[k][j] != 0)
2058
+ { cdata[i][j]+=data[k][j];
2059
+ cmask[i][j]++;
2060
+ }
2061
+ }
2062
+ }
2063
+ for (i = 0; i < nclusters; i++)
2064
+ { for (j = 0; j < ncolumns; j++)
2065
+ { if (cmask[i][j]>0)
2066
+ { cdata[i][j] /= cmask[i][j];
2067
+ cmask[i][j] = 1;
2068
+ }
2069
+ }
2070
+ }
2071
+ }
2072
+ else
2073
+ { for (i = 0; i < nrows; i++)
2074
+ { for (j = 0; j < nclusters; j++)
2075
+ { cdata[i][j] = 0.;
2076
+ cmask[i][j] = 0;
2077
+ }
2078
+ }
2079
+ for (k = 0; k < ncolumns; k++)
2080
+ { i = clusterid[k];
2081
+ for (j = 0; j < nrows; j++)
2082
+ { if (mask[j][k] != 0)
2083
+ { cdata[j][i]+=data[j][k];
2084
+ cmask[j][i]++;
2085
+ }
2086
+ }
2087
+ }
2088
+ for (i = 0; i < nrows; i++)
2089
+ { for (j = 0; j < nclusters; j++)
2090
+ { if (cmask[i][j]>0)
2091
+ { cdata[i][j] /= cmask[i][j];
2092
+ cmask[i][j] = 1;
2093
+ }
2094
+ }
2095
+ }
2096
+ }
2097
+ }
2098
+
2099
+ /* ********************************************************************* */
2100
+
2101
+ static void
2102
+ getclustermedians(int nclusters, int nrows, int ncolumns,
2103
+ double** data, int** mask, int clusterid[], double** cdata, int** cmask,
2104
+ int transpose, double cache[])
2105
+ /*
2106
+ Purpose
2107
+ =======
2108
+
2109
+ The getclustermedians routine calculates the cluster centroids, given to which
2110
+ cluster each element belongs. The centroid is defined as the median over all
2111
+ elements for each dimension.
2112
+
2113
+ Arguments
2114
+ =========
2115
+
2116
+ nclusters (input) int
2117
+ The number of clusters.
2118
+
2119
+ nrows (input) int
2120
+ The number of rows in the gene expression data matrix, equal to the number of
2121
+ genes.
2122
+
2123
+ ncolumns (input) int
2124
+ The number of columns in the gene expression data matrix, equal to the number of
2125
+ microarrays.
2126
+
2127
+ data (input) double[nrows][ncolumns]
2128
+ The array containing the gene expression data.
2129
+
2130
+ mask (input) int[nrows][ncolumns]
2131
+ This array shows which data values are missing. If mask[i][j]==0, then
2132
+ data[i][j] is missing.
2133
+
2134
+ clusterid (output) int[nrows] if transpose==0
2135
+ int[ncolumns] if transpose==1
2136
+ The cluster number to which each element belongs. If transpose==0, then the
2137
+ dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
2138
+ is equal to ncolumns (the number of microarrays).
2139
+
2140
+ cdata (output) double[nclusters][ncolumns] if transpose==0
2141
+ double[nrows][nclusters] if transpose==1
2142
+ On exit of getclustermedians, this array contains the cluster centroids.
2143
+
2144
+ cmask (output) int[nclusters][ncolumns] if transpose==0
2145
+ int[nrows][nclusters] if transpose==1
2146
+ This array shows which data values of are missing for each centroid. If
2147
+ cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
2148
+ a centroid if all corresponding data values of the cluster members are missing.
2149
+
2150
+ transpose (input) int
2151
+ If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
2152
+ columns (microarrays) are specified.
2153
+
2154
+ cache (input) double[nrows] if transpose==0
2155
+ double[ncolumns] if transpose==1
2156
+ This array should be allocated before calling getclustermedians; its contents
2157
+ on input is not relevant. This array is used as a temporary storage space when
2158
+ calculating the medians.
2159
+
2160
+ ========================================================================
2161
+ */
2162
+ { int i, j, k;
2163
+ if (transpose==0)
2164
+ { for (i = 0; i < nclusters; i++)
2165
+ { for (j = 0; j < ncolumns; j++)
2166
+ { int count = 0;
2167
+ for (k = 0; k < nrows; k++)
2168
+ { if (i==clusterid[k] && mask[k][j])
2169
+ { cache[count] = data[k][j];
2170
+ count++;
2171
+ }
2172
+ }
2173
+ if (count>0)
2174
+ { cdata[i][j] = median(count,cache);
2175
+ cmask[i][j] = 1;
2176
+ }
2177
+ else
2178
+ { cdata[i][j] = 0.;
2179
+ cmask[i][j] = 0;
2180
+ }
2181
+ }
2182
+ }
2183
+ }
2184
+ else
2185
+ { for (i = 0; i < nclusters; i++)
2186
+ { for (j = 0; j < nrows; j++)
2187
+ { int count = 0;
2188
+ for (k = 0; k < ncolumns; k++)
2189
+ { if (i==clusterid[k] && mask[j][k])
2190
+ { cache[count] = data[j][k];
2191
+ count++;
2192
+ }
2193
+ }
2194
+ if (count>0)
2195
+ { cdata[j][i] = median(count,cache);
2196
+ cmask[j][i] = 1;
2197
+ }
2198
+ else
2199
+ { cdata[j][i] = 0.;
2200
+ cmask[j][i] = 0;
2201
+ }
2202
+ }
2203
+ }
2204
+ }
2205
+ }
2206
+
2207
+ /* ********************************************************************* */
2208
+
2209
+ int getclustercentroids(int nclusters, int nrows, int ncolumns,
2210
+ double** data, int** mask, int clusterid[], double** cdata, int** cmask,
2211
+ int transpose, char method)
2212
+ /*
2213
+ Purpose
2214
+ =======
2215
+
2216
+ The getclustercentroids routine calculates the cluster centroids, given to
2217
+ which cluster each element belongs. Depending on the argument method, the
2218
+ centroid is defined as either the mean or the median for each dimension over
2219
+ all elements belonging to a cluster.
2220
+
2221
+ Arguments
2222
+ =========
2223
+
2224
+ nclusters (input) int
2225
+ The number of clusters.
2226
+
2227
+ nrows (input) int
2228
+ The number of rows in the gene expression data matrix, equal to the number of
2229
+ genes.
2230
+
2231
+ ncolumns (input) int
2232
+ The number of columns in the gene expression data matrix, equal to the number of
2233
+ microarrays.
2234
+
2235
+ data (input) double[nrows][ncolumns]
2236
+ The array containing the gene expression data.
2237
+
2238
+ mask (input) int[nrows][ncolumns]
2239
+ This array shows which data values are missing. If mask[i][j]==0, then
2240
+ data[i][j] is missing.
2241
+
2242
+ clusterid (output) int[nrows] if transpose==0
2243
+ int[ncolumns] if transpose==1
2244
+ The cluster number to which each element belongs. If transpose==0, then the
2245
+ dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
2246
+ is equal to ncolumns (the number of microarrays).
2247
+
2248
+ cdata (output) double[nclusters][ncolumns] if transpose==0
2249
+ double[nrows][nclusters] if transpose==1
2250
+ On exit of getclustercentroids, this array contains the cluster centroids.
2251
+
2252
+ cmask (output) int[nclusters][ncolumns] if transpose==0
2253
+ int[nrows][nclusters] if transpose==1
2254
+ This array shows which data values of are missing for each centroid. If
2255
+ cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
2256
+ a centroid if all corresponding data values of the cluster members are missing.
2257
+
2258
+ transpose (input) int
2259
+ If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
2260
+ columns (microarrays) are specified.
2261
+
2262
+ method (input) char
2263
+ For method=='a', the centroid is defined as the mean over all elements
2264
+ belonging to a cluster for each dimension.
2265
+ For method=='m', the centroid is defined as the median over all elements
2266
+ belonging to a cluster for each dimension.
2267
+
2268
+ Return value
2269
+ ============
2270
+
2271
+ The function returns an integer to indicate success or failure. If a
2272
+ memory error occurs, or if method is not 'm' or 'a', getclustercentroids
2273
+ returns 0. If successful, getclustercentroids returns 1.
2274
+ ========================================================================
2275
+ */
2276
+ { switch(method)
2277
+ { case 'm':
2278
+ { const int nelements = (transpose==0) ? nrows : ncolumns;
2279
+ double* cache = malloc(nelements*sizeof(double));
2280
+ if (!cache) return 0;
2281
+ getclustermedians(nclusters, nrows, ncolumns, data, mask, clusterid,
2282
+ cdata, cmask, transpose, cache);
2283
+ free(cache);
2284
+ return 1;
2285
+ }
2286
+ case 'a':
2287
+ { getclustermeans(nclusters, nrows, ncolumns, data, mask, clusterid,
2288
+ cdata, cmask, transpose);
2289
+ return 1;
2290
+ }
2291
+ }
2292
+ return 0;
2293
+ }
2294
+
2295
+ /* ********************************************************************* */
2296
+
2297
+ void getclustermedoids(int nclusters, int nelements, double** distance,
2298
+ int clusterid[], int centroids[], double errors[])
2299
+ /*
2300
+ Purpose
2301
+ =======
2302
+
2303
+ The getclustermedoids routine calculates the cluster centroids, given to which
2304
+ cluster each element belongs. The centroid is defined as the element with the
2305
+ smallest sum of distances to the other elements.
2306
+
2307
+ Arguments
2308
+ =========
2309
+
2310
+ nclusters (input) int
2311
+ The number of clusters.
2312
+
2313
+ nelements (input) int
2314
+ The total number of elements.
2315
+
2316
+ distmatrix (input) double array, ragged
2317
+ (number of rows is nelements, number of columns is equal to the row number)
2318
+ The distance matrix. To save space, the distance matrix is given in the
2319
+ form of a ragged array. The distance matrix is symmetric and has zeros
2320
+ on the diagonal. See distancematrix for a description of the content.
2321
+
2322
+ clusterid (output) int[nelements]
2323
+ The cluster number to which each element belongs.
2324
+
2325
+ centroid (output) int[nclusters]
2326
+ The index of the element that functions as the centroid for each cluster.
2327
+
2328
+ errors (output) double[nclusters]
2329
+ The within-cluster sum of distances between the items and the cluster
2330
+ centroid.
2331
+
2332
+ ========================================================================
2333
+ */
2334
+ { int i, j, k;
2335
+ for (j = 0; j < nclusters; j++) errors[j] = DBL_MAX;
2336
+ for (i = 0; i < nelements; i++)
2337
+ { double d = 0.0;
2338
+ j = clusterid[i];
2339
+ for (k = 0; k < nelements; k++)
2340
+ { if (i==k || clusterid[k]!=j) continue;
2341
+ d += (i < k ? distance[k][i] : distance[i][k]);
2342
+ if (d > errors[j]) break;
2343
+ }
2344
+ if (d < errors[j])
2345
+ { errors[j] = d;
2346
+ centroids[j] = i;
2347
+ }
2348
+ }
2349
+ }
2350
+
2351
+ /* ********************************************************************* */
2352
+
2353
+ static int
2354
+ kmeans(int nclusters, int nrows, int ncolumns, double** data, int** mask,
2355
+ double weight[], int transpose, int npass, char dist,
2356
+ double** cdata, int** cmask, int clusterid[], double* error,
2357
+ int tclusterid[], int counts[], int mapping[])
2358
+ { int i, j, k;
2359
+ const int nelements = (transpose==0) ? nrows : ncolumns;
2360
+ const int ndata = (transpose==0) ? ncolumns : nrows;
2361
+ int ifound = 1;
2362
+ int ipass = 0;
2363
+ /* Set the metric function as indicated by dist */
2364
+ double (*metric)
2365
+ (int, double**, double**, int**, int**, const double[], int, int, int) =
2366
+ setmetric(dist);
2367
+
2368
+ /* We save the clustering solution periodically and check if it reappears */
2369
+ int* saved = malloc(nelements*sizeof(int));
2370
+ if (saved==NULL) return -1;
2371
+
2372
+ *error = DBL_MAX;
2373
+
2374
+ do
2375
+ { double total = DBL_MAX;
2376
+ int counter = 0;
2377
+ int period = 10;
2378
+
2379
+ /* Perform the EM algorithm. First, randomly assign elements to clusters. */
2380
+ if (npass!=0) randomassign (nclusters, nelements, tclusterid);
2381
+
2382
+ for (i = 0; i < nclusters; i++) counts[i] = 0;
2383
+ for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
2384
+
2385
+ /* Start the loop */
2386
+ while(1)
2387
+ { double previous = total;
2388
+ total = 0.0;
2389
+
2390
+ if (counter % period == 0) /* Save the current cluster assignments */
2391
+ { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
2392
+ if (period < INT_MAX / 2) period *= 2;
2393
+ }
2394
+ counter++;
2395
+
2396
+ /* Find the center */
2397
+ getclustermeans(nclusters, nrows, ncolumns, data, mask, tclusterid,
2398
+ cdata, cmask, transpose);
2399
+
2400
+ for (i = 0; i < nelements; i++)
2401
+ /* Calculate the distances */
2402
+ { double distance;
2403
+ k = tclusterid[i];
2404
+ if (counts[k]==1) continue;
2405
+ /* No reassignment if that would lead to an empty cluster */
2406
+ /* Treat the present cluster as a special case */
2407
+ distance = metric(ndata,data,cdata,mask,cmask,weight,i,k,transpose);
2408
+ for (j = 0; j < nclusters; j++)
2409
+ { double tdistance;
2410
+ if (j==k) continue;
2411
+ tdistance = metric(ndata,data,cdata,mask,cmask,weight,i,j,transpose);
2412
+ if (tdistance < distance)
2413
+ { distance = tdistance;
2414
+ counts[tclusterid[i]]--;
2415
+ tclusterid[i] = j;
2416
+ counts[j]++;
2417
+ }
2418
+ }
2419
+ total += distance;
2420
+ }
2421
+ if (total>=previous) break;
2422
+ /* total>=previous is FALSE on some machines even if total and previous
2423
+ * are bitwise identical. */
2424
+ for (i = 0; i < nelements; i++)
2425
+ if (saved[i]!=tclusterid[i]) break;
2426
+ if (i==nelements)
2427
+ break; /* Identical solution found; break out of this loop */
2428
+ }
2429
+
2430
+ if (npass<=1)
2431
+ { *error = total;
2432
+ break;
2433
+ }
2434
+
2435
+ for (i = 0; i < nclusters; i++) mapping[i] = -1;
2436
+ for (i = 0; i < nelements; i++)
2437
+ { j = tclusterid[i];
2438
+ k = clusterid[i];
2439
+ if (mapping[k] == -1) mapping[k] = j;
2440
+ else if (mapping[k] != j)
2441
+ { if (total < *error)
2442
+ { ifound = 1;
2443
+ *error = total;
2444
+ for (j = 0; j < nelements; j++) clusterid[j] = tclusterid[j];
2445
+ }
2446
+ break;
2447
+ }
2448
+ }
2449
+ if (i==nelements) ifound++; /* break statement not encountered */
2450
+ } while (++ipass < npass);
2451
+
2452
+ free(saved);
2453
+ return ifound;
2454
+ }
2455
+
2456
+ /* ---------------------------------------------------------------------- */
2457
+
2458
+ static int
2459
+ kmedians(int nclusters, int nrows, int ncolumns, double** data, int** mask,
2460
+ double weight[], int transpose, int npass, char dist,
2461
+ double** cdata, int** cmask, int clusterid[], double* error,
2462
+ int tclusterid[], int counts[], int mapping[], double cache[])
2463
+ { int i, j, k;
2464
+ const int nelements = (transpose==0) ? nrows : ncolumns;
2465
+ const int ndata = (transpose==0) ? ncolumns : nrows;
2466
+ int ifound = 1;
2467
+ int ipass = 0;
2468
+ /* Set the metric function as indicated by dist */
2469
+ double (*metric)
2470
+ (int, double**, double**, int**, int**, const double[], int, int, int) =
2471
+ setmetric(dist);
2472
+
2473
+ /* We save the clustering solution periodically and check if it reappears */
2474
+ int* saved = malloc(nelements*sizeof(int));
2475
+ if (saved==NULL) return -1;
2476
+
2477
+ *error = DBL_MAX;
2478
+
2479
+ do
2480
+ { double total = DBL_MAX;
2481
+ int counter = 0;
2482
+ int period = 10;
2483
+
2484
+ /* Perform the EM algorithm. First, randomly assign elements to clusters. */
2485
+ if (npass!=0) randomassign (nclusters, nelements, tclusterid);
2486
+
2487
+ for (i = 0; i < nclusters; i++) counts[i]=0;
2488
+ for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
2489
+
2490
+ /* Start the loop */
2491
+ while(1)
2492
+ { double previous = total;
2493
+ total = 0.0;
2494
+
2495
+ if (counter % period == 0) /* Save the current cluster assignments */
2496
+ { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
2497
+ if (period < INT_MAX / 2) period *= 2;
2498
+ }
2499
+ counter++;
2500
+
2501
+ /* Find the center */
2502
+ getclustermedians(nclusters, nrows, ncolumns, data, mask, tclusterid,
2503
+ cdata, cmask, transpose, cache);
2504
+
2505
+ for (i = 0; i < nelements; i++)
2506
+ /* Calculate the distances */
2507
+ { double distance;
2508
+ k = tclusterid[i];
2509
+ if (counts[k]==1) continue;
2510
+ /* No reassignment if that would lead to an empty cluster */
2511
+ /* Treat the present cluster as a special case */
2512
+ distance = metric(ndata,data,cdata,mask,cmask,weight,i,k,transpose);
2513
+ for (j = 0; j < nclusters; j++)
2514
+ { double tdistance;
2515
+ if (j==k) continue;
2516
+ tdistance = metric(ndata,data,cdata,mask,cmask,weight,i,j,transpose);
2517
+ if (tdistance < distance)
2518
+ { distance = tdistance;
2519
+ counts[tclusterid[i]]--;
2520
+ tclusterid[i] = j;
2521
+ counts[j]++;
2522
+ }
2523
+ }
2524
+ total += distance;
2525
+ }
2526
+ if (total>=previous) break;
2527
+ /* total>=previous is FALSE on some machines even if total and previous
2528
+ * are bitwise identical. */
2529
+ for (i = 0; i < nelements; i++)
2530
+ if (saved[i]!=tclusterid[i]) break;
2531
+ if (i==nelements)
2532
+ break; /* Identical solution found; break out of this loop */
2533
+ }
2534
+
2535
+ if (npass<=1)
2536
+ { *error = total;
2537
+ break;
2538
+ }
2539
+
2540
+ for (i = 0; i < nclusters; i++) mapping[i] = -1;
2541
+ for (i = 0; i < nelements; i++)
2542
+ { j = tclusterid[i];
2543
+ k = clusterid[i];
2544
+ if (mapping[k] == -1) mapping[k] = j;
2545
+ else if (mapping[k] != j)
2546
+ { if (total < *error)
2547
+ { ifound = 1;
2548
+ *error = total;
2549
+ for (j = 0; j < nelements; j++) clusterid[j] = tclusterid[j];
2550
+ }
2551
+ break;
2552
+ }
2553
+ }
2554
+ if (i==nelements) ifound++; /* break statement not encountered */
2555
+ } while (++ipass < npass);
2556
+
2557
+ free(saved);
2558
+ return ifound;
2559
+ }
2560
+
2561
+ /* ********************************************************************* */
2562
+
2563
+ void kcluster (int nclusters, int nrows, int ncolumns,
2564
+ double** data, int** mask, double weight[], int transpose,
2565
+ int npass, char method, char dist,
2566
+ int clusterid[], double* error, int* ifound)
2567
+ /*
2568
+ Purpose
2569
+ =======
2570
+
2571
+ The kcluster routine performs k-means or k-median clustering on a given set of
2572
+ elements, using the specified distance measure. The number of clusters is given
2573
+ by the user. Multiple passes are being made to find the optimal clustering
2574
+ solution, each time starting from a different initial clustering.
2575
+
2576
+
2577
+ Arguments
2578
+ =========
2579
+
2580
+ nclusters (input) int
2581
+ The number of clusters to be found.
2582
+
2583
+ data (input) double[nrows][ncolumns]
2584
+ The array containing the data of the elements to be clustered (i.e., the gene
2585
+ expression data).
2586
+
2587
+ mask (input) int[nrows][ncolumns]
2588
+ This array shows which data values are missing. If
2589
+ mask[i][j] == 0, then data[i][j] is missing.
2590
+
2591
+ nrows (input) int
2592
+ The number of rows in the data matrix, equal to the number of genes.
2593
+
2594
+ ncolumns (input) int
2595
+ The number of columns in the data matrix, equal to the number of microarrays.
2596
+
2597
+ weight (input) double[n]
2598
+ The weights that are used to calculate the distance.
2599
+
2600
+ transpose (input) int
2601
+ If transpose==0, the rows of the matrix are clustered. Otherwise, columns
2602
+ of the matrix are clustered.
2603
+
2604
+ npass (input) int
2605
+ The number of times clustering is performed. Clustering is performed npass
2606
+ times, each time starting from a different (random) initial assignment of
2607
+ genes to clusters. The clustering solution with the lowest within-cluster sum
2608
+ of distances is chosen.
2609
+ If npass==0, then the clustering algorithm will be run once, where the initial
2610
+ assignment of elements to clusters is taken from the clusterid array.
2611
+
2612
+ method (input) char
2613
+ Defines whether the arithmetic mean (method=='a') or the median
2614
+ (method=='m') is used to calculate the cluster center.
2615
+
2616
+ dist (input) char
2617
+ Defines which distance measure is used, as given by the table:
2618
+ dist=='e': Euclidean distance
2619
+ dist=='b': City-block distance
2620
+ dist=='c': correlation
2621
+ dist=='a': absolute value of the correlation
2622
+ dist=='u': uncentered correlation
2623
+ dist=='x': absolute uncentered correlation
2624
+ dist=='s': Spearman's rank correlation
2625
+ dist=='k': Kendall's tau
2626
+ For other values of dist, the default (Euclidean distance) is used.
2627
+
2628
+ clusterid (output; input) int[nrows] if transpose==0
2629
+ int[ncolumns] if transpose==1
2630
+ The cluster number to which a gene or microarray was assigned. If npass==0,
2631
+ then on input clusterid contains the initial clustering assignment from which
2632
+ the clustering algorithm starts. On output, it contains the clustering solution
2633
+ that was found.
2634
+
2635
+ error (output) double*
2636
+ The sum of distances to the cluster center of each item in the optimal k-means
2637
+ clustering solution that was found.
2638
+
2639
+ ifound (output) int*
2640
+ The number of times the optimal clustering solution was
2641
+ found. The value of ifound is at least 1; its maximum value is npass. If the
2642
+ number of clusters is larger than the number of elements being clustered,
2643
+ *ifound is set to 0 as an error code. If a memory allocation error occurs,
2644
+ *ifound is set to -1.
2645
+
2646
+ ========================================================================
2647
+ */
2648
+ { const int nelements = (transpose==0) ? nrows : ncolumns;
2649
+ const int ndata = (transpose==0) ? ncolumns : nrows;
2650
+
2651
+ int i;
2652
+ int ok;
2653
+ int* tclusterid;
2654
+ int* mapping = NULL;
2655
+ double** cdata;
2656
+ int** cmask;
2657
+ int* counts;
2658
+
2659
+ if (nelements < nclusters)
2660
+ { *ifound = 0;
2661
+ return;
2662
+ }
2663
+ /* More clusters asked for than elements available */
2664
+
2665
+ *ifound = -1;
2666
+
2667
+ /* This will contain the number of elements in each cluster, which is
2668
+ * needed to check for empty clusters. */
2669
+ counts = malloc(nclusters*sizeof(int));
2670
+ if(!counts) return;
2671
+
2672
+ /* Find out if the user specified an initial clustering */
2673
+ if (npass<=1) tclusterid = clusterid;
2674
+ else
2675
+ { tclusterid = malloc(nelements*sizeof(int));
2676
+ if (!tclusterid)
2677
+ { free(counts);
2678
+ return;
2679
+ }
2680
+ mapping = malloc(nclusters*sizeof(int));
2681
+ if (!mapping)
2682
+ { free(counts);
2683
+ free(tclusterid);
2684
+ return;
2685
+ }
2686
+ for (i = 0; i < nelements; i++) clusterid[i] = 0;
2687
+ }
2688
+
2689
+ /* Allocate space to store the centroid data */
2690
+ if (transpose==0) ok = makedatamask(nclusters, ndata, &cdata, &cmask);
2691
+ else ok = makedatamask(ndata, nclusters, &cdata, &cmask);
2692
+ if(!ok)
2693
+ { free(counts);
2694
+ if(npass>1)
2695
+ { free(tclusterid);
2696
+ free(mapping);
2697
+ return;
2698
+ }
2699
+ }
2700
+
2701
+ if (method=='m')
2702
+ { double* cache = malloc(nelements*sizeof(double));
2703
+ if(cache)
2704
+ { *ifound = kmedians(nclusters, nrows, ncolumns, data, mask, weight,
2705
+ transpose, npass, dist, cdata, cmask, clusterid, error,
2706
+ tclusterid, counts, mapping, cache);
2707
+ free(cache);
2708
+ }
2709
+ }
2710
+ else
2711
+ *ifound = kmeans(nclusters, nrows, ncolumns, data, mask, weight,
2712
+ transpose, npass, dist, cdata, cmask, clusterid, error,
2713
+ tclusterid, counts, mapping);
2714
+
2715
+ /* Deallocate temporarily used space */
2716
+ if (npass > 1)
2717
+ { free(mapping);
2718
+ free(tclusterid);
2719
+ }
2720
+
2721
+ if (transpose==0) freedatamask(nclusters, cdata, cmask);
2722
+ else freedatamask(ndata, cdata, cmask);
2723
+
2724
+ free(counts);
2725
+ }
2726
+
2727
+ /* *********************************************************************** */
2728
+
2729
+ void kmedoids (int nclusters, int nelements, double** distmatrix,
2730
+ int npass, int clusterid[], double* error, int* ifound)
2731
+ /*
2732
+ Purpose
2733
+ =======
2734
+
2735
+ The kmedoids routine performs k-medoids clustering on a given set of elements,
2736
+ using the distance matrix and the number of clusters passed by the user.
2737
+ Multiple passes are being made to find the optimal clustering solution, each
2738
+ time starting from a different initial clustering.
2739
+
2740
+
2741
+ Arguments
2742
+ =========
2743
+
2744
+ nclusters (input) int
2745
+ The number of clusters to be found.
2746
+
2747
+ nelements (input) int
2748
+ The number of elements to be clustered.
2749
+
2750
+ distmatrix (input) double array, ragged
2751
+ (number of rows is nelements, number of columns is equal to the row number)
2752
+ The distance matrix. To save space, the distance matrix is given in the
2753
+ form of a ragged array. The distance matrix is symmetric and has zeros
2754
+ on the diagonal. See distancematrix for a description of the content.
2755
+
2756
+ npass (input) int
2757
+ The number of times clustering is performed. Clustering is performed npass
2758
+ times, each time starting from a different (random) initial assignment of genes
2759
+ to clusters. The clustering solution with the lowest within-cluster sum of
2760
+ distances is chosen.
2761
+ If npass==0, then the clustering algorithm will be run once, where the initial
2762
+ assignment of elements to clusters is taken from the clusterid array.
2763
+
2764
+ clusterid (output; input) int[nelements]
2765
+ On input, if npass==0, then clusterid contains the initial clustering assignment
2766
+ from which the clustering algorithm starts; all numbers in clusterid should be
2767
+ between zero and nelements-1 inclusive. If npass!=0, clusterid is ignored on
2768
+ input.
2769
+ On output, clusterid contains the clustering solution that was found: clusterid
2770
+ contains the number of the cluster to which each item was assigned. On output,
2771
+ the number of a cluster is defined as the item number of the centroid of the
2772
+ cluster.
2773
+
2774
+ error (output) double
2775
+ The sum of distances to the cluster center of each item in the optimal k-medoids
2776
+ clustering solution that was found.
2777
+
2778
+ ifound (output) int
2779
+ If kmedoids is successful: the number of times the optimal clustering solution
2780
+ was found. The value of ifound is at least 1; its maximum value is npass.
2781
+ If the user requested more clusters than elements available, ifound is set
2782
+ to 0. If kmedoids fails due to a memory allocation error, ifound is set to -1.
2783
+
2784
+ ========================================================================
2785
+ */
2786
+ { int i, j, icluster;
2787
+ int* tclusterid;
2788
+ int* saved;
2789
+ int* centroids;
2790
+ double* errors;
2791
+ int ipass = 0;
2792
+
2793
+ if (nelements < nclusters)
2794
+ { *ifound = 0;
2795
+ return;
2796
+ } /* More clusters asked for than elements available */
2797
+
2798
+ *ifound = -1;
2799
+
2800
+ /* We save the clustering solution periodically and check if it reappears */
2801
+ saved = malloc(nelements*sizeof(int));
2802
+ if (saved==NULL) return;
2803
+
2804
+ centroids = malloc(nclusters*sizeof(int));
2805
+ if(!centroids)
2806
+ { free(saved);
2807
+ return;
2808
+ }
2809
+
2810
+ errors = malloc(nclusters*sizeof(double));
2811
+ if(!errors)
2812
+ { free(saved);
2813
+ free(centroids);
2814
+ return;
2815
+ }
2816
+
2817
+ /* Find out if the user specified an initial clustering */
2818
+ if (npass<=1) tclusterid = clusterid;
2819
+ else
2820
+ { tclusterid = malloc(nelements*sizeof(int));
2821
+ if(!tclusterid)
2822
+ { free(saved);
2823
+ free(centroids);
2824
+ free(errors);
2825
+ return;
2826
+ }
2827
+ }
2828
+
2829
+ *error = DBL_MAX;
2830
+ do /* Start the loop */
2831
+ { double total = DBL_MAX;
2832
+ int counter = 0;
2833
+ int period = 10;
2834
+
2835
+ if (npass!=0) randomassign (nclusters, nelements, tclusterid);
2836
+ while(1)
2837
+ { double previous = total;
2838
+ total = 0.0;
2839
+
2840
+ if (counter % period == 0) /* Save the current cluster assignments */
2841
+ { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
2842
+ if (period < INT_MAX / 2) period *= 2;
2843
+ }
2844
+ counter++;
2845
+
2846
+ /* Find the center */
2847
+ getclustermedoids(nclusters, nelements, distmatrix, tclusterid,
2848
+ centroids, errors);
2849
+
2850
+ for (i = 0; i < nelements; i++)
2851
+ /* Find the closest cluster */
2852
+ { double distance = DBL_MAX;
2853
+ for (icluster = 0; icluster < nclusters; icluster++)
2854
+ { double tdistance;
2855
+ j = centroids[icluster];
2856
+ if (i==j)
2857
+ { distance = 0.0;
2858
+ tclusterid[i] = icluster;
2859
+ break;
2860
+ }
2861
+ tdistance = (i > j) ? distmatrix[i][j] : distmatrix[j][i];
2862
+ if (tdistance < distance)
2863
+ { distance = tdistance;
2864
+ tclusterid[i] = icluster;
2865
+ }
2866
+ }
2867
+ total += distance;
2868
+ }
2869
+ if (total>=previous) break;
2870
+ /* total>=previous is FALSE on some machines even if total and previous
2871
+ * are bitwise identical. */
2872
+ for (i = 0; i < nelements; i++)
2873
+ if (saved[i]!=tclusterid[i]) break;
2874
+ if (i==nelements)
2875
+ break; /* Identical solution found; break out of this loop */
2876
+ }
2877
+
2878
+ for (i = 0; i < nelements; i++)
2879
+ { if (clusterid[i]!=centroids[tclusterid[i]])
2880
+ { if (total < *error)
2881
+ { *ifound = 1;
2882
+ *error = total;
2883
+ /* Replace by the centroid in each cluster. */
2884
+ for (j = 0; j < nelements; j++)
2885
+ clusterid[j] = centroids[tclusterid[j]];
2886
+ }
2887
+ break;
2888
+ }
2889
+ }
2890
+ if (i==nelements) (*ifound)++; /* break statement not encountered */
2891
+ } while (++ipass < npass);
2892
+
2893
+ /* Deallocate temporarily used space */
2894
+ if (npass > 1) free(tclusterid);
2895
+
2896
+ free(saved);
2897
+ free(centroids);
2898
+ free(errors);
2899
+
2900
+ return;
2901
+ }
2902
+
2903
+ /* ******************************************************************** */
2904
+
2905
+ double** distancematrix (int nrows, int ncolumns, double** data,
2906
+ int** mask, double weights[], char dist, int transpose)
2907
+ /*
2908
+ Purpose
2909
+ =======
2910
+
2911
+ The distancematrix routine calculates the distance matrix between genes or
2912
+ microarrays using their measured gene expression data. Several distance measures
2913
+ can be used. The routine returns a pointer to a ragged array containing the
2914
+ distances between the genes. As the distance matrix is symmetric, with zeros on
2915
+ the diagonal, only the lower triangular half of the distance matrix is saved.
2916
+ The distancematrix routine allocates space for the distance matrix. If the
2917
+ parameter transpose is set to a nonzero value, the distances between the columns
2918
+ (microarrays) are calculated, otherwise distances between the rows (genes) are
2919
+ calculated.
2920
+ If sufficient space in memory cannot be allocated to store the distance matrix,
2921
+ the routine returns a NULL pointer, and all memory allocated so far for the
2922
+ distance matrix is freed.
2923
+
2924
+
2925
+ Arguments
2926
+ =========
2927
+
2928
+ nrows (input) int
2929
+ The number of rows in the gene expression data matrix (i.e., the number of
2930
+ genes)
2931
+
2932
+ ncolumns (input) int
2933
+ The number of columns in the gene expression data matrix (i.e., the number of
2934
+ microarrays)
2935
+
2936
+ data (input) double[nrows][ncolumns]
2937
+ The array containing the gene expression data.
2938
+
2939
+ mask (input) int[nrows][ncolumns]
2940
+ This array shows which data values are missing. If mask[i][j]==0, then
2941
+ data[i][j] is missing.
2942
+
2943
+ weight (input) double[n]
2944
+ The weights that are used to calculate the distance. The length of this vector
2945
+ is equal to the number of columns if the distances between genes are calculated,
2946
+ or the number of rows if the distances between microarrays are calculated.
2947
+
2948
+ dist (input) char
2949
+ Defines which distance measure is used, as given by the table:
2950
+ dist=='e': Euclidean distance
2951
+ dist=='b': City-block distance
2952
+ dist=='c': correlation
2953
+ dist=='a': absolute value of the correlation
2954
+ dist=='u': uncentered correlation
2955
+ dist=='x': absolute uncentered correlation
2956
+ dist=='s': Spearman's rank correlation
2957
+ dist=='k': Kendall's tau
2958
+ For other values of dist, the default (Euclidean distance) is used.
2959
+
2960
+ transpose (input) int
2961
+ If transpose is equal to zero, the distances between the rows is
2962
+ calculated. Otherwise, the distances between the columns is calculated.
2963
+ The former is needed when genes are being clustered; the latter is used
2964
+ when microarrays are being clustered.
2965
+
2966
+ ========================================================================
2967
+ */
2968
+ { /* First determine the size of the distance matrix */
2969
+ const int n = (transpose==0) ? nrows : ncolumns;
2970
+ const int ndata = (transpose==0) ? ncolumns : nrows;
2971
+ int i,j;
2972
+ double** matrix;
2973
+
2974
+ /* Set the metric function as indicated by dist */
2975
+ double (*metric)
2976
+ (int, double**, double**, int**, int**, const double[], int, int, int) =
2977
+ setmetric(dist);
2978
+
2979
+ if (n < 2) return NULL;
2980
+
2981
+ /* Set up the ragged array */
2982
+ matrix = malloc(n*sizeof(double*));
2983
+ if(matrix==NULL) return NULL; /* Not enough memory available */
2984
+ matrix[0] = NULL;
2985
+ /* The zeroth row has zero columns. We allocate it anyway for convenience.*/
2986
+ for (i = 1; i < n; i++)
2987
+ { matrix[i] = malloc(i*sizeof(double));
2988
+ if (matrix[i]==NULL) break; /* Not enough memory available */
2989
+ }
2990
+ if (i < n) /* break condition encountered */
2991
+ { j = i;
2992
+ for (i = 1; i < j; i++) free(matrix[i]);
2993
+ return NULL;
2994
+ }
2995
+
2996
+ /* Calculate the distances and save them in the ragged array */
2997
+ for (i = 1; i < n; i++)
2998
+ for (j = 0; j < i; j++)
2999
+ matrix[i][j]=metric(ndata,data,data,mask,mask,weights,i,j,transpose);
3000
+
3001
+ return matrix;
3002
+ }
3003
+
3004
+ /* ******************************************************************** */
3005
+
3006
+ double* calculate_weights(int nrows, int ncolumns, double** data, int** mask,
3007
+ double weights[], int transpose, char dist, double cutoff, double exponent)
3008
+
3009
+ /*
3010
+ Purpose
3011
+ =======
3012
+
3013
+ This function calculates the weights using the weighting scheme proposed by
3014
+ Michael Eisen:
3015
+ w[i] = 1.0 / sum_{j where d[i][j]<cutoff} (1 - d[i][j]/cutoff)^exponent
3016
+ where the cutoff and the exponent are specified by the user.
3017
+
3018
+
3019
+ Arguments
3020
+ =========
3021
+
3022
+ nrows (input) int
3023
+ The number of rows in the gene expression data matrix, equal to the number of
3024
+ genes.
3025
+
3026
+ ncolumns (input) int
3027
+ The number of columns in the gene expression data matrix, equal to the number of
3028
+ microarrays.
3029
+
3030
+ data (input) double[nrows][ncolumns]
3031
+ The array containing the gene expression data.
3032
+
3033
+ mask (input) int[nrows][ncolumns]
3034
+ This array shows which data values are missing. If mask[i][j]==0, then
3035
+ data[i][j] is missing.
3036
+
3037
+ weight (input) int[ncolumns] if transpose==0,
3038
+ int[nrows] if transpose==1
3039
+ The weights that are used to calculate the distance. The length of this vector
3040
+ is ncolumns if gene weights are being clustered, and nrows if microarrays
3041
+ weights are being clustered.
3042
+
3043
+ transpose (input) int
3044
+ If transpose==0, the weights of the rows of the data matrix are calculated.
3045
+ Otherwise, the weights of the columns of the data matrix are calculated.
3046
+
3047
+ dist (input) char
3048
+ Defines which distance measure is used, as given by the table:
3049
+ dist=='e': Euclidean distance
3050
+ dist=='b': City-block distance
3051
+ dist=='c': correlation
3052
+ dist=='a': absolute value of the correlation
3053
+ dist=='u': uncentered correlation
3054
+ dist=='x': absolute uncentered correlation
3055
+ dist=='s': Spearman's rank correlation
3056
+ dist=='k': Kendall's tau
3057
+ For other values of dist, the default (Euclidean distance) is used.
3058
+
3059
+ cutoff (input) double
3060
+ The cutoff to be used to calculate the weights.
3061
+
3062
+ exponent (input) double
3063
+ The exponent to be used to calculate the weights.
3064
+
3065
+
3066
+ Return value
3067
+ ============
3068
+
3069
+ The function returns a pointer to a newly allocated array containing the
3070
+ calculated weights for the rows (if transpose==0) or columns (if
3071
+ transpose==1). If not enough memory could be allocated to store the
3072
+ weights array, the function returns NULL.
3073
+
3074
+ ========================================================================
3075
+ */
3076
+ { int i,j;
3077
+ const int ndata = (transpose==0) ? ncolumns : nrows;
3078
+ const int nelements = (transpose==0) ? nrows : ncolumns;
3079
+
3080
+ /* Set the metric function as indicated by dist */
3081
+ double (*metric)
3082
+ (int, double**, double**, int**, int**, const double[], int, int, int) =
3083
+ setmetric(dist);
3084
+
3085
+ double* result = malloc(nelements*sizeof(double));
3086
+ if (!result) return NULL;
3087
+ memset(result, 0, nelements*sizeof(double));
3088
+
3089
+ for (i = 0; i < nelements; i++)
3090
+ { result[i] += 1.0;
3091
+ for (j = 0; j < i; j++)
3092
+ { const double distance = metric(ndata, data, data, mask, mask, weights,
3093
+ i, j, transpose);
3094
+ if (distance < cutoff)
3095
+ { const double dweight = exp(exponent*log(1-distance/cutoff));
3096
+ /* pow() causes a crash on AIX */
3097
+ result[i] += dweight;
3098
+ result[j] += dweight;
3099
+ }
3100
+ }
3101
+ }
3102
+ for (i = 0; i < nelements; i++) result[i] = 1.0/result[i];
3103
+ return result;
3104
+ }
3105
+
3106
+ /* ******************************************************************** */
3107
+
3108
+ void cuttree (int nelements, Node* tree, int nclusters, int clusterid[])
3109
+
3110
+ /*
3111
+ Purpose
3112
+ =======
3113
+
3114
+ The cuttree routine takes the output of a hierarchical clustering routine, and
3115
+ divides the elements in the tree structure into clusters based on the
3116
+ hierarchical clustering result. The number of clusters is specified by the user.
3117
+
3118
+ Arguments
3119
+ =========
3120
+
3121
+ nelements (input) int
3122
+ The number of elements that were clustered.
3123
+
3124
+ tree (input) Node[nelements-1]
3125
+ The clustering solution. Each node in the array describes one linking event,
3126
+ with tree[i].left and tree[i].right representig the elements that were joined.
3127
+ The original elements are numbered 0..nelements-1, nodes are numbered
3128
+ -1..-(nelements-1).
3129
+
3130
+ nclusters (input) int
3131
+ The number of clusters to be formed.
3132
+
3133
+ clusterid (output) int[nelements]
3134
+ The number of the cluster to which each element was assigned. Space for this
3135
+ array should be allocated before calling the cuttree routine. If a memory
3136
+ error occured, all elements in clusterid are set to -1.
3137
+
3138
+ ========================================================================
3139
+ */
3140
+ { int i, j, k;
3141
+ int icluster = 0;
3142
+ const int n = nelements-nclusters; /* number of nodes to join */
3143
+ int* nodeid;
3144
+ for (i = nelements-2; i >= n; i--)
3145
+ { k = tree[i].left;
3146
+ if (k>=0)
3147
+ { clusterid[k] = icluster;
3148
+ icluster++;
3149
+ }
3150
+ k = tree[i].right;
3151
+ if (k>=0)
3152
+ { clusterid[k] = icluster;
3153
+ icluster++;
3154
+ }
3155
+ }
3156
+ nodeid = malloc(n*sizeof(int));
3157
+ if(!nodeid)
3158
+ { for (i = 0; i < nelements; i++) clusterid[i] = -1;
3159
+ return;
3160
+ }
3161
+ for (i = 0; i < n; i++) nodeid[i] = -1;
3162
+ for (i = n-1; i >= 0; i--)
3163
+ { if(nodeid[i]<0)
3164
+ { j = icluster;
3165
+ nodeid[i] = j;
3166
+ icluster++;
3167
+ }
3168
+ else j = nodeid[i];
3169
+ k = tree[i].left;
3170
+ if (k<0) nodeid[-k-1] = j; else clusterid[k] = j;
3171
+ k = tree[i].right;
3172
+ if (k<0) nodeid[-k-1] = j; else clusterid[k] = j;
3173
+ }
3174
+ free(nodeid);
3175
+ return;
3176
+ }
3177
+
3178
+ /* ******************************************************************** */
3179
+
3180
+ static
3181
+ Node* pclcluster (int nrows, int ncolumns, double** data, int** mask,
3182
+ double weight[], double** distmatrix, char dist, int transpose)
3183
+
3184
+ /*
3185
+
3186
+ Purpose
3187
+ =======
3188
+
3189
+ The pclcluster routine performs clustering using pairwise centroid-linking
3190
+ on a given set of gene expression data, using the distance metric given by dist.
3191
+
3192
+ Arguments
3193
+ =========
3194
+
3195
+ nrows (input) int
3196
+ The number of rows in the gene expression data matrix, equal to the number of
3197
+ genes.
3198
+
3199
+ ncolumns (input) int
3200
+ The number of columns in the gene expression data matrix, equal to the number of
3201
+ microarrays.
3202
+
3203
+ data (input) double[nrows][ncolumns]
3204
+ The array containing the gene expression data.
3205
+
3206
+ mask (input) int[nrows][ncolumns]
3207
+ This array shows which data values are missing. If
3208
+ mask[i][j] == 0, then data[i][j] is missing.
3209
+
3210
+ weight (input) double[ncolumns] if transpose==0;
3211
+ double[nrows] if transpose==1
3212
+ The weights that are used to calculate the distance. The length of this vector
3213
+ is ncolumns if genes are being clustered, and nrows if microarrays are being
3214
+ clustered.
3215
+
3216
+ transpose (input) int
3217
+ If transpose==0, the rows of the matrix are clustered. Otherwise, columns
3218
+ of the matrix are clustered.
3219
+
3220
+ dist (input) char
3221
+ Defines which distance measure is used, as given by the table:
3222
+ dist=='e': Euclidean distance
3223
+ dist=='b': City-block distance
3224
+ dist=='c': correlation
3225
+ dist=='a': absolute value of the correlation
3226
+ dist=='u': uncentered correlation
3227
+ dist=='x': absolute uncentered correlation
3228
+ dist=='s': Spearman's rank correlation
3229
+ dist=='k': Kendall's tau
3230
+ For other values of dist, the default (Euclidean distance) is used.
3231
+
3232
+ distmatrix (input) double**
3233
+ The distance matrix. This matrix is precalculated by the calling routine
3234
+ treecluster. The pclcluster routine modifies the contents of distmatrix, but
3235
+ does not deallocate it.
3236
+
3237
+ Return value
3238
+ ============
3239
+
3240
+ A pointer to a newly allocated array of Node structs, describing the
3241
+ hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3242
+ whether genes (rows) or microarrays (columns) were clustered, nelements is
3243
+ equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3244
+ structure.
3245
+ If a memory error occurs, pclcluster returns NULL.
3246
+ ========================================================================
3247
+ */
3248
+ { int i, j;
3249
+ const int nelements = (transpose==0) ? nrows : ncolumns;
3250
+ int inode;
3251
+ const int ndata = transpose ? nrows : ncolumns;
3252
+ const int nnodes = nelements - 1;
3253
+
3254
+ /* Set the metric function as indicated by dist */
3255
+ double (*metric)
3256
+ (int, double**, double**, int**, int**, const double[], int, int, int) =
3257
+ setmetric(dist);
3258
+
3259
+ Node* result;
3260
+ double** newdata;
3261
+ int** newmask;
3262
+ int* distid = malloc(nelements*sizeof(int));
3263
+ if(!distid) return NULL;
3264
+ result = malloc(nnodes*sizeof(Node));
3265
+ if(!result)
3266
+ { free(distid);
3267
+ return NULL;
3268
+ }
3269
+ if(!makedatamask(nelements, ndata, &newdata, &newmask))
3270
+ { free(result);
3271
+ free(distid);
3272
+ return NULL;
3273
+ }
3274
+
3275
+ for (i = 0; i < nelements; i++) distid[i] = i;
3276
+ /* To remember which row/column in the distance matrix contains what */
3277
+
3278
+ /* Storage for node data */
3279
+ if (transpose)
3280
+ { for (i = 0; i < nelements; i++)
3281
+ { for (j = 0; j < ndata; j++)
3282
+ { newdata[i][j] = data[j][i];
3283
+ newmask[i][j] = mask[j][i];
3284
+ }
3285
+ }
3286
+ data = newdata;
3287
+ mask = newmask;
3288
+ }
3289
+ else
3290
+ { for (i = 0; i < nelements; i++)
3291
+ { memcpy(newdata[i], data[i], ndata*sizeof(double));
3292
+ memcpy(newmask[i], mask[i], ndata*sizeof(int));
3293
+ }
3294
+ data = newdata;
3295
+ mask = newmask;
3296
+ }
3297
+
3298
+ for (inode = 0; inode < nnodes; inode++)
3299
+ { /* Find the pair with the shortest distance */
3300
+ int is = 1;
3301
+ int js = 0;
3302
+ result[inode].distance = find_closest_pair(nelements-inode, distmatrix, &is, &js);
3303
+ result[inode].left = distid[js];
3304
+ result[inode].right = distid[is];
3305
+
3306
+ /* Make node js the new node */
3307
+ for (i = 0; i < ndata; i++)
3308
+ { data[js][i] = data[js][i]*mask[js][i] + data[is][i]*mask[is][i];
3309
+ mask[js][i] += mask[is][i];
3310
+ if (mask[js][i]) data[js][i] /= mask[js][i];
3311
+ }
3312
+ free(data[is]);
3313
+ free(mask[is]);
3314
+ data[is] = data[nnodes-inode];
3315
+ mask[is] = mask[nnodes-inode];
3316
+
3317
+ /* Fix the distances */
3318
+ distid[is] = distid[nnodes-inode];
3319
+ for (i = 0; i < is; i++)
3320
+ distmatrix[is][i] = distmatrix[nnodes-inode][i];
3321
+ for (i = is + 1; i < nnodes-inode; i++)
3322
+ distmatrix[i][is] = distmatrix[nnodes-inode][i];
3323
+
3324
+ distid[js] = -inode-1;
3325
+ for (i = 0; i < js; i++)
3326
+ distmatrix[js][i] = metric(ndata,data,data,mask,mask,weight,js,i,0);
3327
+ for (i = js + 1; i < nnodes-inode; i++)
3328
+ distmatrix[i][js] = metric(ndata,data,data,mask,mask,weight,js,i,0);
3329
+ }
3330
+
3331
+ /* Free temporarily allocated space */
3332
+ free(data[0]);
3333
+ free(mask[0]);
3334
+ free(data);
3335
+ free(mask);
3336
+ free(distid);
3337
+
3338
+ return result;
3339
+ }
3340
+
3341
+ /* ******************************************************************** */
3342
+
3343
+ static
3344
+ int nodecompare(const void* a, const void* b)
3345
+ /* Helper function for qsort. */
3346
+ { const Node* node1 = (const Node*)a;
3347
+ const Node* node2 = (const Node*)b;
3348
+ const double term1 = node1->distance;
3349
+ const double term2 = node2->distance;
3350
+ if (term1 < term2) return -1;
3351
+ if (term1 > term2) return +1;
3352
+ return 0;
3353
+ }
3354
+
3355
+ /* ---------------------------------------------------------------------- */
3356
+
3357
+ static
3358
+ Node* pslcluster (int nrows, int ncolumns, double** data, int** mask,
3359
+ double weight[], double** distmatrix, char dist, int transpose)
3360
+
3361
+ /*
3362
+
3363
+ Purpose
3364
+ =======
3365
+
3366
+ The pslcluster routine performs single-linkage hierarchical clustering, using
3367
+ either the distance matrix directly, if available, or by calculating the
3368
+ distances from the data array. This implementation is based on the SLINK
3369
+ algorithm, described in:
3370
+ Sibson, R. (1973). SLINK: An optimally efficient algorithm for the single-link
3371
+ cluster method. The Computer Journal, 16(1): 30-34.
3372
+ The output of this algorithm is identical to conventional single-linkage
3373
+ hierarchical clustering, but is much more memory-efficient and faster. Hence,
3374
+ it can be applied to large data sets, for which the conventional single-
3375
+ linkage algorithm fails due to lack of memory.
3376
+
3377
+
3378
+ Arguments
3379
+ =========
3380
+
3381
+ nrows (input) int
3382
+ The number of rows in the gene expression data matrix, equal to the number of
3383
+ genes.
3384
+
3385
+ ncolumns (input) int
3386
+ The number of columns in the gene expression data matrix, equal to the number of
3387
+ microarrays.
3388
+
3389
+ data (input) double[nrows][ncolumns]
3390
+ The array containing the gene expression data.
3391
+
3392
+ mask (input) int[nrows][ncolumns]
3393
+ This array shows which data values are missing. If
3394
+ mask[i][j] == 0, then data[i][j] is missing.
3395
+
3396
+ weight (input) double[n]
3397
+ The weights that are used to calculate the distance. The length of this vector
3398
+ is ncolumns if genes are being clustered, and nrows if microarrays are being
3399
+ clustered.
3400
+
3401
+ transpose (input) int
3402
+ If transpose==0, the rows of the matrix are clustered. Otherwise, columns
3403
+ of the matrix are clustered.
3404
+
3405
+ dist (input) char
3406
+ Defines which distance measure is used, as given by the table:
3407
+ dist=='e': Euclidean distance
3408
+ dist=='b': City-block distance
3409
+ dist=='c': correlation
3410
+ dist=='a': absolute value of the correlation
3411
+ dist=='u': uncentered correlation
3412
+ dist=='x': absolute uncentered correlation
3413
+ dist=='s': Spearman's rank correlation
3414
+ dist=='k': Kendall's tau
3415
+ For other values of dist, the default (Euclidean distance) is used.
3416
+
3417
+ distmatrix (input) double**
3418
+ The distance matrix. If the distance matrix is passed by the calling routine
3419
+ treecluster, it is used by pslcluster to speed up the clustering calculation.
3420
+ The pslcluster routine does not modify the contents of distmatrix, and does
3421
+ not deallocate it. If distmatrix is NULL, the pairwise distances are calculated
3422
+ by the pslcluster routine from the gene expression data (the data and mask
3423
+ arrays) and stored in temporary arrays. If distmatrix is passed, the original
3424
+ gene expression data (specified by the data and mask arguments) are not needed
3425
+ and are therefore ignored.
3426
+
3427
+
3428
+ Return value
3429
+ ============
3430
+
3431
+ A pointer to a newly allocated array of Node structs, describing the
3432
+ hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3433
+ whether genes (rows) or microarrays (columns) were clustered, nelements is
3434
+ equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3435
+ structure.
3436
+ If a memory error occurs, pslcluster returns NULL.
3437
+
3438
+ ========================================================================
3439
+ */
3440
+ { int i, j, k;
3441
+ const int nelements = transpose ? ncolumns : nrows;
3442
+ const int nnodes = nelements - 1;
3443
+ int* vector;
3444
+ double* temp;
3445
+ int* index;
3446
+ Node* result;
3447
+ temp = malloc(nnodes*sizeof(double));
3448
+ if(!temp) return NULL;
3449
+ index = malloc(nelements*sizeof(int));
3450
+ if(!index)
3451
+ { free(temp);
3452
+ return NULL;
3453
+ }
3454
+ vector = malloc(nnodes*sizeof(int));
3455
+ if(!vector)
3456
+ { free(index);
3457
+ free(temp);
3458
+ return NULL;
3459
+ }
3460
+ result = malloc(nelements*sizeof(Node));
3461
+ if(!result)
3462
+ { free(vector);
3463
+ free(index);
3464
+ free(temp);
3465
+ return NULL;
3466
+ }
3467
+
3468
+ for (i = 0; i < nnodes; i++) vector[i] = i;
3469
+
3470
+ if(distmatrix)
3471
+ { for (i = 0; i < nrows; i++)
3472
+ { result[i].distance = DBL_MAX;
3473
+ for (j = 0; j < i; j++) temp[j] = distmatrix[i][j];
3474
+ for (j = 0; j < i; j++)
3475
+ { k = vector[j];
3476
+ if (result[j].distance >= temp[j])
3477
+ { if (result[j].distance < temp[k]) temp[k] = result[j].distance;
3478
+ result[j].distance = temp[j];
3479
+ vector[j] = i;
3480
+ }
3481
+ else if (temp[j] < temp[k]) temp[k] = temp[j];
3482
+ }
3483
+ for (j = 0; j < i; j++)
3484
+ {
3485
+ if (result[j].distance >= result[vector[j]].distance) vector[j] = i;
3486
+ }
3487
+ }
3488
+ }
3489
+ else
3490
+ { const int ndata = transpose ? nrows : ncolumns;
3491
+ /* Set the metric function as indicated by dist */
3492
+ double (*metric)
3493
+ (int, double**, double**, int**, int**, const double[], int, int, int) =
3494
+ setmetric(dist);
3495
+
3496
+ for (i = 0; i < nelements; i++)
3497
+ { result[i].distance = DBL_MAX;
3498
+ for (j = 0; j < i; j++) temp[j] =
3499
+ metric(ndata, data, data, mask, mask, weight, i, j, transpose);
3500
+ for (j = 0; j < i; j++)
3501
+ { k = vector[j];
3502
+ if (result[j].distance >= temp[j])
3503
+ { if (result[j].distance < temp[k]) temp[k] = result[j].distance;
3504
+ result[j].distance = temp[j];
3505
+ vector[j] = i;
3506
+ }
3507
+ else if (temp[j] < temp[k]) temp[k] = temp[j];
3508
+ }
3509
+ for (j = 0; j < i; j++)
3510
+ if (result[j].distance >= result[vector[j]].distance) vector[j] = i;
3511
+ }
3512
+ }
3513
+ free(temp);
3514
+
3515
+ for (i = 0; i < nnodes; i++) result[i].left = i;
3516
+ qsort(result, nnodes, sizeof(Node), nodecompare);
3517
+
3518
+ for (i = 0; i < nelements; i++) index[i] = i;
3519
+ for (i = 0; i < nnodes; i++)
3520
+ { j = result[i].left;
3521
+ k = vector[j];
3522
+ result[i].left = index[j];
3523
+ result[i].right = index[k];
3524
+ index[k] = -i-1;
3525
+ }
3526
+ free(vector);
3527
+ free(index);
3528
+
3529
+ result = realloc(result, nnodes*sizeof(Node));
3530
+
3531
+ return result;
3532
+ }
3533
+ /* ******************************************************************** */
3534
+
3535
+ static Node* pmlcluster (int nelements, double** distmatrix)
3536
+ /*
3537
+
3538
+ Purpose
3539
+ =======
3540
+
3541
+ The pmlcluster routine performs clustering using pairwise maximum- (complete-)
3542
+ linking on the given distance matrix.
3543
+
3544
+ Arguments
3545
+ =========
3546
+
3547
+ nelements (input) int
3548
+ The number of elements to be clustered.
3549
+
3550
+ distmatrix (input) double**
3551
+ The distance matrix, with nelements rows, each row being filled up to the
3552
+ diagonal. The elements on the diagonal are not used, as they are assumed to be
3553
+ zero. The distance matrix will be modified by this routine.
3554
+
3555
+ Return value
3556
+ ============
3557
+
3558
+ A pointer to a newly allocated array of Node structs, describing the
3559
+ hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3560
+ whether genes (rows) or microarrays (columns) were clustered, nelements is
3561
+ equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3562
+ structure.
3563
+ If a memory error occurs, pmlcluster returns NULL.
3564
+ ========================================================================
3565
+ */
3566
+ { int j;
3567
+ int n;
3568
+ int* clusterid;
3569
+ Node* result;
3570
+
3571
+ clusterid = malloc(nelements*sizeof(int));
3572
+ if(!clusterid) return NULL;
3573
+ result = malloc((nelements-1)*sizeof(Node));
3574
+ if (!result)
3575
+ { free(clusterid);
3576
+ return NULL;
3577
+ }
3578
+
3579
+ /* Setup a list specifying to which cluster a gene belongs */
3580
+ for (j = 0; j < nelements; j++) clusterid[j] = j;
3581
+
3582
+ for (n = nelements; n > 1; n--)
3583
+ { int is = 1;
3584
+ int js = 0;
3585
+ result[nelements-n].distance = find_closest_pair(n, distmatrix, &is, &js);
3586
+
3587
+ /* Fix the distances */
3588
+ for (j = 0; j < js; j++)
3589
+ distmatrix[js][j] = max(distmatrix[is][j],distmatrix[js][j]);
3590
+ for (j = js+1; j < is; j++)
3591
+ distmatrix[j][js] = max(distmatrix[is][j],distmatrix[j][js]);
3592
+ for (j = is+1; j < n; j++)
3593
+ distmatrix[j][js] = max(distmatrix[j][is],distmatrix[j][js]);
3594
+
3595
+ for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
3596
+ for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
3597
+
3598
+ /* Update clusterids */
3599
+ result[nelements-n].left = clusterid[is];
3600
+ result[nelements-n].right = clusterid[js];
3601
+ clusterid[js] = n-nelements-1;
3602
+ clusterid[is] = clusterid[n-1];
3603
+ }
3604
+ free(clusterid);
3605
+
3606
+ return result;
3607
+ }
3608
+
3609
+ /* ******************************************************************* */
3610
+
3611
+ static Node* palcluster (int nelements, double** distmatrix)
3612
+ /*
3613
+ Purpose
3614
+ =======
3615
+
3616
+ The palcluster routine performs clustering using pairwise average
3617
+ linking on the given distance matrix.
3618
+
3619
+ Arguments
3620
+ =========
3621
+
3622
+ nelements (input) int
3623
+ The number of elements to be clustered.
3624
+
3625
+ distmatrix (input) double**
3626
+ The distance matrix, with nelements rows, each row being filled up to the
3627
+ diagonal. The elements on the diagonal are not used, as they are assumed to be
3628
+ zero. The distance matrix will be modified by this routine.
3629
+
3630
+ Return value
3631
+ ============
3632
+
3633
+ A pointer to a newly allocated array of Node structs, describing the
3634
+ hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3635
+ whether genes (rows) or microarrays (columns) were clustered, nelements is
3636
+ equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3637
+ structure.
3638
+ If a memory error occurs, palcluster returns NULL.
3639
+ ========================================================================
3640
+ */
3641
+ { int j;
3642
+ int n;
3643
+ int* clusterid;
3644
+ int* number;
3645
+ Node* result;
3646
+
3647
+ clusterid = malloc(nelements*sizeof(int));
3648
+ if(!clusterid) return NULL;
3649
+ number = malloc(nelements*sizeof(int));
3650
+ if(!number)
3651
+ { free(clusterid);
3652
+ return NULL;
3653
+ }
3654
+ result = malloc((nelements-1)*sizeof(Node));
3655
+ if (!result)
3656
+ { free(clusterid);
3657
+ free(number);
3658
+ return NULL;
3659
+ }
3660
+
3661
+ /* Setup a list specifying to which cluster a gene belongs, and keep track
3662
+ * of the number of elements in each cluster (needed to calculate the
3663
+ * average). */
3664
+ for (j = 0; j < nelements; j++)
3665
+ { number[j] = 1;
3666
+ clusterid[j] = j;
3667
+ }
3668
+
3669
+ for (n = nelements; n > 1; n--)
3670
+ { int sum;
3671
+ int is = 1;
3672
+ int js = 0;
3673
+ result[nelements-n].distance = find_closest_pair(n, distmatrix, &is, &js);
3674
+
3675
+ /* Save result */
3676
+ result[nelements-n].left = clusterid[is];
3677
+ result[nelements-n].right = clusterid[js];
3678
+
3679
+ /* Fix the distances */
3680
+ sum = number[is] + number[js];
3681
+ for (j = 0; j < js; j++)
3682
+ { distmatrix[js][j] = distmatrix[is][j]*number[is]
3683
+ + distmatrix[js][j]*number[js];
3684
+ distmatrix[js][j] /= sum;
3685
+ }
3686
+ for (j = js+1; j < is; j++)
3687
+ { distmatrix[j][js] = distmatrix[is][j]*number[is]
3688
+ + distmatrix[j][js]*number[js];
3689
+ distmatrix[j][js] /= sum;
3690
+ }
3691
+ for (j = is+1; j < n; j++)
3692
+ { distmatrix[j][js] = distmatrix[j][is]*number[is]
3693
+ + distmatrix[j][js]*number[js];
3694
+ distmatrix[j][js] /= sum;
3695
+ }
3696
+
3697
+ for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
3698
+ for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
3699
+
3700
+ /* Update number of elements in the clusters */
3701
+ number[js] = sum;
3702
+ number[is] = number[n-1];
3703
+
3704
+ /* Update clusterids */
3705
+ clusterid[js] = n-nelements-1;
3706
+ clusterid[is] = clusterid[n-1];
3707
+ }
3708
+ free(clusterid);
3709
+ free(number);
3710
+
3711
+ return result;
3712
+ }
3713
+
3714
+ /* ******************************************************************* */
3715
+
3716
+ Node* treecluster (int nrows, int ncolumns, double** data, int** mask,
3717
+ double weight[], int transpose, char dist, char method, double** distmatrix)
3718
+ /*
3719
+ Purpose
3720
+ =======
3721
+
3722
+ The treecluster routine performs hierarchical clustering using pairwise
3723
+ single-, maximum-, centroid-, or average-linkage, as defined by method, on a
3724
+ given set of gene expression data, using the distance metric given by dist.
3725
+ If successful, the function returns a pointer to a newly allocated Tree struct
3726
+ containing the hierarchical clustering solution, and NULL if a memory error
3727
+ occurs. The pointer should be freed by the calling routine to prevent memory
3728
+ leaks.
3729
+
3730
+ Arguments
3731
+ =========
3732
+
3733
+ nrows (input) int
3734
+ The number of rows in the data matrix, equal to the number of genes.
3735
+
3736
+ ncolumns (input) int
3737
+ The number of columns in the data matrix, equal to the number of microarrays.
3738
+
3739
+ data (input) double[nrows][ncolumns]
3740
+ The array containing the data of the vectors to be clustered.
3741
+
3742
+ mask (input) int[nrows][ncolumns]
3743
+ This array shows which data values are missing. If mask[i][j]==0, then
3744
+ data[i][j] is missing.
3745
+
3746
+ weight (input) double array[n]
3747
+ The weights that are used to calculate the distance.
3748
+
3749
+ transpose (input) int
3750
+ If transpose==0, the rows of the matrix are clustered. Otherwise, columns
3751
+ of the matrix are clustered.
3752
+
3753
+ dist (input) char
3754
+ Defines which distance measure is used, as given by the table:
3755
+ dist=='e': Euclidean distance
3756
+ dist=='b': City-block distance
3757
+ dist=='c': correlation
3758
+ dist=='a': absolute value of the correlation
3759
+ dist=='u': uncentered correlation
3760
+ dist=='x': absolute uncentered correlation
3761
+ dist=='s': Spearman's rank correlation
3762
+ dist=='k': Kendall's tau
3763
+ For other values of dist, the default (Euclidean distance) is used.
3764
+
3765
+ method (input) char
3766
+ Defines which hierarchical clustering method is used:
3767
+ method=='s': pairwise single-linkage clustering
3768
+ method=='m': pairwise maximum- (or complete-) linkage clustering
3769
+ method=='a': pairwise average-linkage clustering
3770
+ method=='c': pairwise centroid-linkage clustering
3771
+ For the first three, either the distance matrix or the gene expression data is
3772
+ sufficient to perform the clustering algorithm. For pairwise centroid-linkage
3773
+ clustering, however, the gene expression data are always needed, even if the
3774
+ distance matrix itself is available.
3775
+
3776
+ distmatrix (input) double**
3777
+ The distance matrix. If the distance matrix is zero initially, the distance
3778
+ matrix will be allocated and calculated from the data by treecluster, and
3779
+ deallocated before treecluster returns. If the distance matrix is passed by the
3780
+ calling routine, treecluster will modify the contents of the distance matrix as
3781
+ part of the clustering algorithm, but will not deallocate it. The calling
3782
+ routine should deallocate the distance matrix after the return from treecluster.
3783
+
3784
+ Return value
3785
+ ============
3786
+
3787
+ A pointer to a newly allocated array of Node structs, describing the
3788
+ hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3789
+ whether genes (rows) or microarrays (columns) were clustered, nelements is
3790
+ equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3791
+ structure.
3792
+ If a memory error occurs, treecluster returns NULL.
3793
+
3794
+ ========================================================================
3795
+ */
3796
+ { Node* result = NULL;
3797
+ const int nelements = (transpose==0) ? nrows : ncolumns;
3798
+ const int ldistmatrix = (distmatrix==NULL && method!='s') ? 1 : 0;
3799
+
3800
+ if (nelements < 2) return NULL;
3801
+
3802
+ /* Calculate the distance matrix if the user didn't give it */
3803
+ if(ldistmatrix)
3804
+ { distmatrix =
3805
+ distancematrix(nrows, ncolumns, data, mask, weight, dist, transpose);
3806
+ if (!distmatrix) return NULL; /* Insufficient memory */
3807
+ }
3808
+
3809
+ switch(method)
3810
+ { case 's':
3811
+ result = pslcluster(nrows, ncolumns, data, mask, weight, distmatrix,
3812
+ dist, transpose);
3813
+ break;
3814
+ case 'm':
3815
+ result = pmlcluster(nelements, distmatrix);
3816
+ break;
3817
+ case 'a':
3818
+ result = palcluster(nelements, distmatrix);
3819
+ break;
3820
+ case 'c':
3821
+ result = pclcluster(nrows, ncolumns, data, mask, weight, distmatrix,
3822
+ dist, transpose);
3823
+ break;
3824
+ }
3825
+
3826
+ /* Deallocate space for distance matrix, if it was allocated by treecluster */
3827
+ if(ldistmatrix)
3828
+ { int i;
3829
+ for (i = 1; i < nelements; i++) free(distmatrix[i]);
3830
+ free (distmatrix);
3831
+ }
3832
+
3833
+ return result;
3834
+ }
3835
+
3836
+ /* ******************************************************************* */
3837
+
3838
+ static
3839
+ void somworker (int nrows, int ncolumns, double** data, int** mask,
3840
+ const double weights[], int transpose, int nxgrid, int nygrid,
3841
+ double inittau, double*** celldata, int niter, char dist)
3842
+
3843
+ { const int nelements = (transpose==0) ? nrows : ncolumns;
3844
+ const int ndata = (transpose==0) ? ncolumns : nrows;
3845
+ int i, j;
3846
+ double* stddata = calloc(nelements,sizeof(double));
3847
+ int** dummymask;
3848
+ int ix, iy;
3849
+ int* index;
3850
+ int iter;
3851
+ /* Maximum radius in which nodes are adjusted */
3852
+ double maxradius = sqrt(nxgrid*nxgrid+nygrid*nygrid);
3853
+
3854
+ /* Set the metric function as indicated by dist */
3855
+ double (*metric)
3856
+ (int, double**, double**, int**, int**, const double[], int, int, int) =
3857
+ setmetric(dist);
3858
+
3859
+ /* Calculate the standard deviation for each row or column */
3860
+ if (transpose==0)
3861
+ { for (i = 0; i < nelements; i++)
3862
+ { int n = 0;
3863
+ for (j = 0; j < ndata; j++)
3864
+ { if (mask[i][j])
3865
+ { double term = data[i][j];
3866
+ term = term * term;
3867
+ stddata[i] += term;
3868
+ n++;
3869
+ }
3870
+ }
3871
+ if (stddata[i] > 0) stddata[i] = sqrt(stddata[i]/n);
3872
+ else stddata[i] = 1;
3873
+ }
3874
+ }
3875
+ else
3876
+ { for (i = 0; i < nelements; i++)
3877
+ { int n = 0;
3878
+ for (j = 0; j < ndata; j++)
3879
+ { if (mask[j][i])
3880
+ { double term = data[j][i];
3881
+ term = term * term;
3882
+ stddata[i] += term;
3883
+ n++;
3884
+ }
3885
+ }
3886
+ if (stddata[i] > 0) stddata[i] = sqrt(stddata[i]/n);
3887
+ else stddata[i] = 1;
3888
+ }
3889
+ }
3890
+
3891
+ if (transpose==0)
3892
+ { dummymask = malloc(nygrid*sizeof(int*));
3893
+ for (i = 0; i < nygrid; i++)
3894
+ { dummymask[i] = malloc(ndata*sizeof(int));
3895
+ for (j = 0; j < ndata; j++) dummymask[i][j] = 1;
3896
+ }
3897
+ }
3898
+ else
3899
+ { dummymask = malloc(ndata*sizeof(int*));
3900
+ for (i = 0; i < ndata; i++)
3901
+ { dummymask[i] = malloc(sizeof(int));
3902
+ dummymask[i][0] = 1;
3903
+ }
3904
+ }
3905
+
3906
+ /* Randomly initialize the nodes */
3907
+ for (ix = 0; ix < nxgrid; ix++)
3908
+ { for (iy = 0; iy < nygrid; iy++)
3909
+ { double sum = 0.;
3910
+ for (i = 0; i < ndata; i++)
3911
+ { double term = -1.0 + 2.0*uniform();
3912
+ celldata[ix][iy][i] = term;
3913
+ sum += term * term;
3914
+ }
3915
+ sum = sqrt(sum/ndata);
3916
+ for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum;
3917
+ }
3918
+ }
3919
+
3920
+ /* Randomize the order in which genes or arrays will be used */
3921
+ index = malloc(nelements*sizeof(int));
3922
+ for (i = 0; i < nelements; i++) index[i] = i;
3923
+ for (i = 0; i < nelements; i++)
3924
+ { j = (int) (i + (nelements-i)*uniform());
3925
+ ix = index[j];
3926
+ index[j] = index[i];
3927
+ index[i] = ix;
3928
+ }
3929
+
3930
+ /* Start the iteration */
3931
+ for (iter = 0; iter < niter; iter++)
3932
+ { int ixbest = 0;
3933
+ int iybest = 0;
3934
+ int iobject = iter % nelements;
3935
+ iobject = index[iobject];
3936
+ if (transpose==0)
3937
+ { double closest = metric(ndata,data,celldata[ixbest],
3938
+ mask,dummymask,weights,iobject,iybest,transpose);
3939
+ double radius = maxradius * (1. - ((double)iter)/((double)niter));
3940
+ double tau = inittau * (1. - ((double)iter)/((double)niter));
3941
+
3942
+ for (ix = 0; ix < nxgrid; ix++)
3943
+ { for (iy = 0; iy < nygrid; iy++)
3944
+ { double distance =
3945
+ metric (ndata,data,celldata[ix],
3946
+ mask,dummymask,weights,iobject,iy,transpose);
3947
+ if (distance < closest)
3948
+ { ixbest = ix;
3949
+ iybest = iy;
3950
+ closest = distance;
3951
+ }
3952
+ }
3953
+ }
3954
+ for (ix = 0; ix < nxgrid; ix++)
3955
+ { for (iy = 0; iy < nygrid; iy++)
3956
+ { if (sqrt((ix-ixbest)*(ix-ixbest)+(iy-iybest)*(iy-iybest))<radius)
3957
+ { double sum = 0.;
3958
+ for (i = 0; i < ndata; i++)
3959
+ { if (mask[iobject][i]==0) continue;
3960
+ celldata[ix][iy][i] +=
3961
+ tau * (data[iobject][i]/stddata[iobject]-celldata[ix][iy][i]);
3962
+ }
3963
+ for (i = 0; i < ndata; i++)
3964
+ { double term = celldata[ix][iy][i];
3965
+ term = term * term;
3966
+ sum += term;
3967
+ }
3968
+ if (sum>0)
3969
+ { sum = sqrt(sum/ndata);
3970
+ for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum;
3971
+ }
3972
+ }
3973
+ }
3974
+ }
3975
+ }
3976
+ else
3977
+ { double closest;
3978
+ double** celldatavector = malloc(ndata*sizeof(double*));
3979
+ double radius = maxradius * (1. - ((double)iter)/((double)niter));
3980
+ double tau = inittau * (1. - ((double)iter)/((double)niter));
3981
+
3982
+ for (i = 0; i < ndata; i++)
3983
+ celldatavector[i] = &(celldata[ixbest][iybest][i]);
3984
+ closest = metric(ndata,data,celldatavector,
3985
+ mask,dummymask,weights,iobject,0,transpose);
3986
+ for (ix = 0; ix < nxgrid; ix++)
3987
+ { for (iy = 0; iy < nygrid; iy++)
3988
+ { double distance;
3989
+ for (i = 0; i < ndata; i++)
3990
+ celldatavector[i] = &(celldata[ixbest][iybest][i]);
3991
+ distance =
3992
+ metric (ndata,data,celldatavector,
3993
+ mask,dummymask,weights,iobject,0,transpose);
3994
+ if (distance < closest)
3995
+ { ixbest = ix;
3996
+ iybest = iy;
3997
+ closest = distance;
3998
+ }
3999
+ }
4000
+ }
4001
+ free(celldatavector);
4002
+ for (ix = 0; ix < nxgrid; ix++)
4003
+ { for (iy = 0; iy < nygrid; iy++)
4004
+ { if (sqrt((ix-ixbest)*(ix-ixbest)+(iy-iybest)*(iy-iybest))<radius)
4005
+ { double sum = 0.;
4006
+ for (i = 0; i < ndata; i++)
4007
+ { if (mask[i][iobject]==0) continue;
4008
+ celldata[ix][iy][i] +=
4009
+ tau * (data[i][iobject]/stddata[iobject]-celldata[ix][iy][i]);
4010
+ }
4011
+ for (i = 0; i < ndata; i++)
4012
+ { double term = celldata[ix][iy][i];
4013
+ term = term * term;
4014
+ sum += term;
4015
+ }
4016
+ if (sum>0)
4017
+ { sum = sqrt(sum/ndata);
4018
+ for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum;
4019
+ }
4020
+ }
4021
+ }
4022
+ }
4023
+ }
4024
+ }
4025
+ if (transpose==0)
4026
+ for (i = 0; i < nygrid; i++) free(dummymask[i]);
4027
+ else
4028
+ for (i = 0; i < ndata; i++) free(dummymask[i]);
4029
+ free(dummymask);
4030
+ free(stddata);
4031
+ free(index);
4032
+ return;
4033
+ }
4034
+
4035
+ /* ******************************************************************* */
4036
+
4037
+ static
4038
+ void somassign (int nrows, int ncolumns, double** data, int** mask,
4039
+ const double weights[], int transpose, int nxgrid, int nygrid,
4040
+ double*** celldata, char dist, int clusterid[][2])
4041
+ /* Collect clusterids */
4042
+ { const int ndata = (transpose==0) ? ncolumns : nrows;
4043
+ int i,j;
4044
+
4045
+ /* Set the metric function as indicated by dist */
4046
+ double (*metric)
4047
+ (int, double**, double**, int**, int**, const double[], int, int, int) =
4048
+ setmetric(dist);
4049
+
4050
+ if (transpose==0)
4051
+ { int** dummymask = malloc(nygrid*sizeof(int*));
4052
+ for (i = 0; i < nygrid; i++)
4053
+ { dummymask[i] = malloc(ncolumns*sizeof(int));
4054
+ for (j = 0; j < ncolumns; j++) dummymask[i][j] = 1;
4055
+ }
4056
+ for (i = 0; i < nrows; i++)
4057
+ { int ixbest = 0;
4058
+ int iybest = 0;
4059
+ double closest = metric(ndata,data,celldata[ixbest],
4060
+ mask,dummymask,weights,i,iybest,transpose);
4061
+ int ix, iy;
4062
+ for (ix = 0; ix < nxgrid; ix++)
4063
+ { for (iy = 0; iy < nygrid; iy++)
4064
+ { double distance =
4065
+ metric (ndata,data,celldata[ix],
4066
+ mask,dummymask,weights,i,iy,transpose);
4067
+ if (distance < closest)
4068
+ { ixbest = ix;
4069
+ iybest = iy;
4070
+ closest = distance;
4071
+ }
4072
+ }
4073
+ }
4074
+ clusterid[i][0] = ixbest;
4075
+ clusterid[i][1] = iybest;
4076
+ }
4077
+ for (i = 0; i < nygrid; i++) free(dummymask[i]);
4078
+ free(dummymask);
4079
+ }
4080
+ else
4081
+ { double** celldatavector = malloc(ndata*sizeof(double*));
4082
+ int** dummymask = malloc(nrows*sizeof(int*));
4083
+ int ixbest = 0;
4084
+ int iybest = 0;
4085
+ for (i = 0; i < nrows; i++)
4086
+ { dummymask[i] = malloc(sizeof(int));
4087
+ dummymask[i][0] = 1;
4088
+ }
4089
+ for (i = 0; i < ncolumns; i++)
4090
+ { double closest;
4091
+ int ix, iy;
4092
+ for (j = 0; j < ndata; j++)
4093
+ celldatavector[j] = &(celldata[ixbest][iybest][j]);
4094
+ closest = metric(ndata,data,celldatavector,
4095
+ mask,dummymask,weights,i,0,transpose);
4096
+ for (ix = 0; ix < nxgrid; ix++)
4097
+ { for (iy = 0; iy < nygrid; iy++)
4098
+ { double distance;
4099
+ for(j = 0; j < ndata; j++)
4100
+ celldatavector[j] = &(celldata[ix][iy][j]);
4101
+ distance = metric(ndata,data,celldatavector,
4102
+ mask,dummymask,weights,i,0,transpose);
4103
+ if (distance < closest)
4104
+ { ixbest = ix;
4105
+ iybest = iy;
4106
+ closest = distance;
4107
+ }
4108
+ }
4109
+ }
4110
+ clusterid[i][0] = ixbest;
4111
+ clusterid[i][1] = iybest;
4112
+ }
4113
+ free(celldatavector);
4114
+ for (i = 0; i < nrows; i++) free(dummymask[i]);
4115
+ free(dummymask);
4116
+ }
4117
+ return;
4118
+ }
4119
+
4120
+ /* ******************************************************************* */
4121
+
4122
+ void somcluster (int nrows, int ncolumns, double** data, int** mask,
4123
+ const double weight[], int transpose, int nxgrid, int nygrid,
4124
+ double inittau, int niter, char dist, double*** celldata, int clusterid[][2])
4125
+ /*
4126
+
4127
+ Purpose
4128
+ =======
4129
+
4130
+ The somcluster routine implements a self-organizing map (Kohonen) on a
4131
+ rectangular grid, using a given set of vectors. The distance measure to be
4132
+ used to find the similarity between genes and nodes is given by dist.
4133
+
4134
+ Arguments
4135
+ =========
4136
+
4137
+ nrows (input) int
4138
+ The number of rows in the data matrix, equal to the number of genes.
4139
+
4140
+ ncolumns (input) int
4141
+ The number of columns in the data matrix, equal to the number of microarrays.
4142
+
4143
+ data (input) double[nrows][ncolumns]
4144
+ The array containing the gene expression data.
4145
+
4146
+ mask (input) int[nrows][ncolumns]
4147
+ This array shows which data values are missing. If
4148
+ mask[i][j] == 0, then data[i][j] is missing.
4149
+
4150
+ weights (input) double[ncolumns] if transpose==0;
4151
+ double[nrows] if transpose==1
4152
+ The weights that are used to calculate the distance. The length of this vector
4153
+ is ncolumns if genes are being clustered, or nrows if microarrays are being
4154
+ clustered.
4155
+
4156
+ transpose (input) int
4157
+ If transpose==0, the rows (genes) of the matrix are clustered. Otherwise,
4158
+ columns (microarrays) of the matrix are clustered.
4159
+
4160
+ nxgrid (input) int
4161
+ The number of grid cells horizontally in the rectangular topology of clusters.
4162
+
4163
+ nygrid (input) int
4164
+ The number of grid cells horizontally in the rectangular topology of clusters.
4165
+
4166
+ inittau (input) double
4167
+ The initial value of tau, representing the neighborhood function.
4168
+
4169
+ niter (input) int
4170
+ The number of iterations to be performed.
4171
+
4172
+ dist (input) char
4173
+ Defines which distance measure is used, as given by the table:
4174
+ dist=='e': Euclidean distance
4175
+ dist=='b': City-block distance
4176
+ dist=='c': correlation
4177
+ dist=='a': absolute value of the correlation
4178
+ dist=='u': uncentered correlation
4179
+ dist=='x': absolute uncentered correlation
4180
+ dist=='s': Spearman's rank correlation
4181
+ dist=='k': Kendall's tau
4182
+ For other values of dist, the default (Euclidean distance) is used.
4183
+
4184
+ celldata (output) double[nxgrid][nygrid][ncolumns] if transpose==0;
4185
+ double[nxgrid][nygrid][nrows] if tranpose==1
4186
+ The gene expression data for each node (cell) in the 2D grid. This can be
4187
+ interpreted as the centroid for the cluster corresponding to that cell. If
4188
+ celldata is NULL, then the centroids are not returned. If celldata is not
4189
+ NULL, enough space should be allocated to store the centroid data before callingsomcluster.
4190
+
4191
+ clusterid (output), int[nrows][2] if transpose==0;
4192
+ int[ncolumns][2] if transpose==1
4193
+ For each item (gene or microarray) that is clustered, the coordinates of the
4194
+ cell in the 2D grid to which the item was assigned. If clusterid is NULL, the
4195
+ cluster assignments are not returned. If clusterid is not NULL, enough memory
4196
+ should be allocated to store the clustering information before calling
4197
+ somcluster.
4198
+
4199
+ ========================================================================
4200
+ */
4201
+ { const int nobjects = (transpose==0) ? nrows : ncolumns;
4202
+ const int ndata = (transpose==0) ? ncolumns : nrows;
4203
+ int i,j;
4204
+ const int lcelldata = (celldata==NULL) ? 0 : 1;
4205
+
4206
+ if (nobjects < 2) return;
4207
+
4208
+ if (lcelldata==0)
4209
+ { celldata = malloc(nxgrid*nygrid*ndata*sizeof(double**));
4210
+ for (i = 0; i < nxgrid; i++)
4211
+ { celldata[i] = malloc(nygrid*ndata*sizeof(double*));
4212
+ for (j = 0; j < nygrid; j++)
4213
+ celldata[i][j] = malloc(ndata*sizeof(double));
4214
+ }
4215
+ }
4216
+
4217
+ somworker (nrows, ncolumns, data, mask, weight, transpose, nxgrid, nygrid,
4218
+ inittau, celldata, niter, dist);
4219
+ if (clusterid)
4220
+ somassign (nrows, ncolumns, data, mask, weight, transpose,
4221
+ nxgrid, nygrid, celldata, dist, clusterid);
4222
+ if(lcelldata==0)
4223
+ { for (i = 0; i < nxgrid; i++)
4224
+ for (j = 0; j < nygrid; j++)
4225
+ free(celldata[i][j]);
4226
+ for (i = 0; i < nxgrid; i++)
4227
+ free(celldata[i]);
4228
+ free(celldata);
4229
+ }
4230
+ return;
4231
+ }
4232
+
4233
+ /* ******************************************************************** */
4234
+
4235
+ double clusterdistance (int nrows, int ncolumns, double** data,
4236
+ int** mask, double weight[], int n1, int n2, int index1[], int index2[],
4237
+ char dist, char method, int transpose)
4238
+
4239
+ /*
4240
+ Purpose
4241
+ =======
4242
+
4243
+ The clusterdistance routine calculates the distance between two clusters
4244
+ containing genes or microarrays using the measured gene expression vectors. The
4245
+ distance between clusters, given the genes/microarrays in each cluster, can be
4246
+ defined in several ways. Several distance measures can be used.
4247
+
4248
+ The routine returns the distance in double precision.
4249
+ If the parameter transpose is set to a nonzero value, the clusters are
4250
+ interpreted as clusters of microarrays, otherwise as clusters of gene.
4251
+
4252
+ Arguments
4253
+ =========
4254
+
4255
+ nrows (input) int
4256
+ The number of rows (i.e., the number of genes) in the gene expression data
4257
+ matrix.
4258
+
4259
+ ncolumns (input) int
4260
+ The number of columns (i.e., the number of microarrays) in the gene expression
4261
+ data matrix.
4262
+
4263
+ data (input) double[nrows][ncolumns]
4264
+ The array containing the data of the vectors.
4265
+
4266
+ mask (input) int[nrows][ncolumns]
4267
+ This array shows which data values are missing. If mask[i][j]==0, then
4268
+ data[i][j] is missing.
4269
+
4270
+ weight (input) double[ncolumns] if transpose==0;
4271
+ double[nrows] if transpose==1
4272
+ The weights that are used to calculate the distance.
4273
+
4274
+ n1 (input) int
4275
+ The number of elements in the first cluster.
4276
+
4277
+ n2 (input) int
4278
+ The number of elements in the second cluster.
4279
+
4280
+ index1 (input) int[n1]
4281
+ Identifies which genes/microarrays belong to the first cluster.
4282
+
4283
+ index2 (input) int[n2]
4284
+ Identifies which genes/microarrays belong to the second cluster.
4285
+
4286
+ dist (input) char
4287
+ Defines which distance measure is used, as given by the table:
4288
+ dist=='e': Euclidean distance
4289
+ dist=='b': City-block distance
4290
+ dist=='c': correlation
4291
+ dist=='a': absolute value of the correlation
4292
+ dist=='u': uncentered correlation
4293
+ dist=='x': absolute uncentered correlation
4294
+ dist=='s': Spearman's rank correlation
4295
+ dist=='k': Kendall's tau
4296
+ For other values of dist, the default (Euclidean distance) is used.
4297
+
4298
+ method (input) char
4299
+ Defines how the distance between two clusters is defined, given which genes
4300
+ belong to which cluster:
4301
+ method=='a': the distance between the arithmetic means of the two clusters
4302
+ method=='m': the distance between the medians of the two clusters
4303
+ method=='s': the smallest pairwise distance between members of the two clusters
4304
+ method=='x': the largest pairwise distance between members of the two clusters
4305
+ method=='v': average of the pairwise distances between members of the clusters
4306
+
4307
+ transpose (input) int
4308
+ If transpose is equal to zero, the distances between the rows is
4309
+ calculated. Otherwise, the distances between the columns is calculated.
4310
+ The former is needed when genes are being clustered; the latter is used
4311
+ when microarrays are being clustered.
4312
+
4313
+ ========================================================================
4314
+ */
4315
+ { /* Set the metric function as indicated by dist */
4316
+ double (*metric)
4317
+ (int, double**, double**, int**, int**, const double[], int, int, int) =
4318
+ setmetric(dist);
4319
+
4320
+ /* if one or both clusters are empty, return */
4321
+ if (n1 < 1 || n2 < 1) return -1.0;
4322
+ /* Check the indices */
4323
+ if (transpose==0)
4324
+ { int i;
4325
+ for (i = 0; i < n1; i++)
4326
+ { int index = index1[i];
4327
+ if (index < 0 || index >= nrows) return -1.0;
4328
+ }
4329
+ for (i = 0; i < n2; i++)
4330
+ { int index = index2[i];
4331
+ if (index < 0 || index >= nrows) return -1.0;
4332
+ }
4333
+ }
4334
+ else
4335
+ { int i;
4336
+ for (i = 0; i < n1; i++)
4337
+ { int index = index1[i];
4338
+ if (index < 0 || index >= ncolumns) return -1.0;
4339
+ }
4340
+ for (i = 0; i < n2; i++)
4341
+ { int index = index2[i];
4342
+ if (index < 0 || index >= ncolumns) return -1.0;
4343
+ }
4344
+ }
4345
+
4346
+ switch (method)
4347
+ { case 'a':
4348
+ { /* Find the center */
4349
+ int i,j,k;
4350
+ if (transpose==0)
4351
+ { double distance;
4352
+ double* cdata[2];
4353
+ int* cmask[2];
4354
+ int* count[2];
4355
+ count[0] = calloc(ncolumns,sizeof(int));
4356
+ count[1] = calloc(ncolumns,sizeof(int));
4357
+ cdata[0] = calloc(ncolumns,sizeof(double));
4358
+ cdata[1] = calloc(ncolumns,sizeof(double));
4359
+ cmask[0] = malloc(ncolumns*sizeof(int));
4360
+ cmask[1] = malloc(ncolumns*sizeof(int));
4361
+ for (i = 0; i < n1; i++)
4362
+ { k = index1[i];
4363
+ for (j = 0; j < ncolumns; j++)
4364
+ if (mask[k][j] != 0)
4365
+ { cdata[0][j] = cdata[0][j] + data[k][j];
4366
+ count[0][j] = count[0][j] + 1;
4367
+ }
4368
+ }
4369
+ for (i = 0; i < n2; i++)
4370
+ { k = index2[i];
4371
+ for (j = 0; j < ncolumns; j++)
4372
+ if (mask[k][j] != 0)
4373
+ { cdata[1][j] = cdata[1][j] + data[k][j];
4374
+ count[1][j] = count[1][j] + 1;
4375
+ }
4376
+ }
4377
+ for (i = 0; i < 2; i++)
4378
+ for (j = 0; j < ncolumns; j++)
4379
+ { if (count[i][j]>0)
4380
+ { cdata[i][j] = cdata[i][j] / count[i][j];
4381
+ cmask[i][j] = 1;
4382
+ }
4383
+ else
4384
+ cmask[i][j] = 0;
4385
+ }
4386
+ distance =
4387
+ metric (ncolumns,cdata,cdata,cmask,cmask,weight,0,1,0);
4388
+ for (i = 0; i < 2; i++)
4389
+ { free (cdata[i]);
4390
+ free (cmask[i]);
4391
+ free (count[i]);
4392
+ }
4393
+ return distance;
4394
+ }
4395
+ else
4396
+ { double distance;
4397
+ int** count = malloc(nrows*sizeof(int*));
4398
+ double** cdata = malloc(nrows*sizeof(double*));
4399
+ int** cmask = malloc(nrows*sizeof(int*));
4400
+ for (i = 0; i < nrows; i++)
4401
+ { count[i] = calloc(2,sizeof(int));
4402
+ cdata[i] = calloc(2,sizeof(double));
4403
+ cmask[i] = malloc(2*sizeof(int));
4404
+ }
4405
+ for (i = 0; i < n1; i++)
4406
+ { k = index1[i];
4407
+ for (j = 0; j < nrows; j++)
4408
+ { if (mask[j][k] != 0)
4409
+ { cdata[j][0] = cdata[j][0] + data[j][k];
4410
+ count[j][0] = count[j][0] + 1;
4411
+ }
4412
+ }
4413
+ }
4414
+ for (i = 0; i < n2; i++)
4415
+ { k = index2[i];
4416
+ for (j = 0; j < nrows; j++)
4417
+ { if (mask[j][k] != 0)
4418
+ { cdata[j][1] = cdata[j][1] + data[j][k];
4419
+ count[j][1] = count[j][1] + 1;
4420
+ }
4421
+ }
4422
+ }
4423
+ for (i = 0; i < nrows; i++)
4424
+ for (j = 0; j < 2; j++)
4425
+ if (count[i][j]>0)
4426
+ { cdata[i][j] = cdata[i][j] / count[i][j];
4427
+ cmask[i][j] = 1;
4428
+ }
4429
+ else
4430
+ cmask[i][j] = 0;
4431
+ distance = metric (nrows,cdata,cdata,cmask,cmask,weight,0,1,1);
4432
+ for (i = 0; i < nrows; i++)
4433
+ { free (count[i]);
4434
+ free (cdata[i]);
4435
+ free (cmask[i]);
4436
+ }
4437
+ free (count);
4438
+ free (cdata);
4439
+ free (cmask);
4440
+ return distance;
4441
+ }
4442
+ }
4443
+ case 'm':
4444
+ { int i, j, k;
4445
+ if (transpose==0)
4446
+ { double distance;
4447
+ double* temp = malloc(nrows*sizeof(double));
4448
+ double* cdata[2];
4449
+ int* cmask[2];
4450
+ for (i = 0; i < 2; i++)
4451
+ { cdata[i] = malloc(ncolumns*sizeof(double));
4452
+ cmask[i] = malloc(ncolumns*sizeof(int));
4453
+ }
4454
+ for (j = 0; j < ncolumns; j++)
4455
+ { int count = 0;
4456
+ for (k = 0; k < n1; k++)
4457
+ { i = index1[k];
4458
+ if (mask[i][j])
4459
+ { temp[count] = data[i][j];
4460
+ count++;
4461
+ }
4462
+ }
4463
+ if (count>0)
4464
+ { cdata[0][j] = median (count,temp);
4465
+ cmask[0][j] = 1;
4466
+ }
4467
+ else
4468
+ { cdata[0][j] = 0.;
4469
+ cmask[0][j] = 0;
4470
+ }
4471
+ }
4472
+ for (j = 0; j < ncolumns; j++)
4473
+ { int count = 0;
4474
+ for (k = 0; k < n2; k++)
4475
+ { i = index2[k];
4476
+ if (mask[i][j])
4477
+ { temp[count] = data[i][j];
4478
+ count++;
4479
+ }
4480
+ }
4481
+ if (count>0)
4482
+ { cdata[1][j] = median (count,temp);
4483
+ cmask[1][j] = 1;
4484
+ }
4485
+ else
4486
+ { cdata[1][j] = 0.;
4487
+ cmask[1][j] = 0;
4488
+ }
4489
+ }
4490
+ distance = metric (ncolumns,cdata,cdata,cmask,cmask,weight,0,1,0);
4491
+ for (i = 0; i < 2; i++)
4492
+ { free (cdata[i]);
4493
+ free (cmask[i]);
4494
+ }
4495
+ free(temp);
4496
+ return distance;
4497
+ }
4498
+ else
4499
+ { double distance;
4500
+ double* temp = malloc(ncolumns*sizeof(double));
4501
+ double** cdata = malloc(nrows*sizeof(double*));
4502
+ int** cmask = malloc(nrows*sizeof(int*));
4503
+ for (i = 0; i < nrows; i++)
4504
+ { cdata[i] = malloc(2*sizeof(double));
4505
+ cmask[i] = malloc(2*sizeof(int));
4506
+ }
4507
+ for (j = 0; j < nrows; j++)
4508
+ { int count = 0;
4509
+ for (k = 0; k < n1; k++)
4510
+ { i = index1[k];
4511
+ if (mask[j][i])
4512
+ { temp[count] = data[j][i];
4513
+ count++;
4514
+ }
4515
+ }
4516
+ if (count>0)
4517
+ { cdata[j][0] = median (count,temp);
4518
+ cmask[j][0] = 1;
4519
+ }
4520
+ else
4521
+ { cdata[j][0] = 0.;
4522
+ cmask[j][0] = 0;
4523
+ }
4524
+ }
4525
+ for (j = 0; j < nrows; j++)
4526
+ { int count = 0;
4527
+ for (k = 0; k < n2; k++)
4528
+ { i = index2[k];
4529
+ if (mask[j][i])
4530
+ { temp[count] = data[j][i];
4531
+ count++;
4532
+ }
4533
+ }
4534
+ if (count>0)
4535
+ { cdata[j][1] = median (count,temp);
4536
+ cmask[j][1] = 1;
4537
+ }
4538
+ else
4539
+ { cdata[j][1] = 0.;
4540
+ cmask[j][1] = 0;
4541
+ }
4542
+ }
4543
+ distance = metric (nrows,cdata,cdata,cmask,cmask,weight,0,1,1);
4544
+ for (i = 0; i < nrows; i++)
4545
+ { free (cdata[i]);
4546
+ free (cmask[i]);
4547
+ }
4548
+ free(cdata);
4549
+ free(cmask);
4550
+ free(temp);
4551
+ return distance;
4552
+ }
4553
+ }
4554
+ case 's':
4555
+ { int i1, i2, j1, j2;
4556
+ const int n = (transpose==0) ? ncolumns : nrows;
4557
+ double mindistance = DBL_MAX;
4558
+ for (i1 = 0; i1 < n1; i1++)
4559
+ for (i2 = 0; i2 < n2; i2++)
4560
+ { double distance;
4561
+ j1 = index1[i1];
4562
+ j2 = index2[i2];
4563
+ distance = metric (n,data,data,mask,mask,weight,j1,j2,transpose);
4564
+ if (distance < mindistance) mindistance = distance;
4565
+ }
4566
+ return mindistance;
4567
+ }
4568
+ case 'x':
4569
+ { int i1, i2, j1, j2;
4570
+ const int n = (transpose==0) ? ncolumns : nrows;
4571
+ double maxdistance = 0;
4572
+ for (i1 = 0; i1 < n1; i1++)
4573
+ for (i2 = 0; i2 < n2; i2++)
4574
+ { double distance;
4575
+ j1 = index1[i1];
4576
+ j2 = index2[i2];
4577
+ distance = metric (n,data,data,mask,mask,weight,j1,j2,transpose);
4578
+ if (distance > maxdistance) maxdistance = distance;
4579
+ }
4580
+ return maxdistance;
4581
+ }
4582
+ case 'v':
4583
+ { int i1, i2, j1, j2;
4584
+ const int n = (transpose==0) ? ncolumns : nrows;
4585
+ double distance = 0;
4586
+ for (i1 = 0; i1 < n1; i1++)
4587
+ for (i2 = 0; i2 < n2; i2++)
4588
+ { j1 = index1[i1];
4589
+ j2 = index2[i2];
4590
+ distance += metric (n,data,data,mask,mask,weight,j1,j2,transpose);
4591
+ }
4592
+ distance /= (n1*n2);
4593
+ return distance;
4594
+ }
4595
+ }
4596
+ /* Never get here */
4597
+ return -2.0;
4598
+ }