rbcluster 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ *.rbc
6
+ *.bundle
7
+ *.so
8
+ tmp/
9
+ .tm_properties
@@ -0,0 +1,6 @@
1
+ rvm:
2
+ - 1.8.7
3
+ - 1.9.2
4
+ - 1.9.3
5
+ - ruby-head
6
+ - rbx
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in rbcluster.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,29 @@
1
+ rbcluster
2
+ Copyright (C) 2011-2012 Jari Bakken.
3
+
4
+ The C clustering library.
5
+ Copyright (C) 2002 Michiel Jan Laurens de Hoon.
6
+
7
+ This library was written at the Laboratory of DNA Information Analysis,
8
+ Human Genome Center, Institute of Medical Science, University of Tokyo,
9
+ 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
10
+ Contact: mdehoon 'AT' gsc.riken.jp
11
+
12
+ Permission to use, copy, modify, and distribute this software and its
13
+ documentation with or without modifications and for any purpose and
14
+ without fee is hereby granted, provided that any copyright notices
15
+ appear in all copies and that both those copyright notices and this
16
+ permission notice appear in supporting documentation, and that the
17
+ names of the contributors or copyright holders not be used in
18
+ advertising or publicity pertaining to distribution of the software
19
+ without specific prior permission.
20
+
21
+ THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
22
+ WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
23
+ WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
24
+ CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
25
+ OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
26
+ OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
27
+ OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
28
+ OR PERFORMANCE OF THIS SOFTWARE.
29
+
@@ -0,0 +1,54 @@
1
+ rbcluster
2
+ =========
3
+
4
+ Ruby bindings to the Cluster C library.
5
+
6
+ TODO
7
+ ----
8
+
9
+ Functions:
10
+
11
+ * Cluster.clustercentroids
12
+ * Cluster.clustermedoids
13
+ * Cluster::Tree#{cut,slice,[],fetch}
14
+
15
+ Other:
16
+
17
+ * an examples/ folder
18
+ * make :transpose work
19
+ * specs for bad inputs
20
+
21
+ DONE
22
+ ----
23
+
24
+ * Cluster.pca
25
+ * Cluster.somcluster
26
+ * Cluster.treecluster
27
+ * Cluster.clusterdistance
28
+ * Cluster.kcluster
29
+ * Cluster.kmedoids
30
+ * Cluster.distancematrix
31
+ * Cluster.median
32
+ * Cluster.mean
33
+
34
+ See also
35
+ --------
36
+
37
+ * http://bonsai.hgc.jp/~mdehoon/software/cluster/software.htm
38
+ * http://bonsai.hgc.jp/~mdehoon/software/cluster/cluster.pdf
39
+
40
+ Note on Patches/Pull Requests
41
+ -----------------------------
42
+
43
+ * Fork the project.
44
+ * Make your feature addition or bug fix.
45
+ * Add tests for it. This is important so I don't break it in a
46
+ future version unintentionally.
47
+ * Commit, do not mess with rakefile, version, or history.
48
+ (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
49
+ * Send me a pull request. Bonus points for topic branches.
50
+
51
+ Copyright
52
+ ---------
53
+
54
+ Copyright (c) 2011-2012 Jari Bakken. See LICENSE for details.
@@ -0,0 +1,17 @@
1
+ include Rake::DSL if defined?(Rake::DSL)
2
+
3
+ require 'bundler'
4
+ Bundler::GemHelper.install_tasks
5
+
6
+ require 'rake/extensiontask'
7
+ Rake::ExtensionTask.new do |ext|
8
+ ext.name = "rbcluster"
9
+ ext.lib_dir = "lib/rbcluster"
10
+ ext.ext_dir = "ext/rbcluster"
11
+ ext.gem_spec = eval(File.read("./rbcluster.gemspec"))
12
+ end
13
+
14
+ require 'rspec/core/rake_task'
15
+ RSpec::Core::RakeTask.new
16
+
17
+ task :default => [:compile, :spec]
@@ -0,0 +1,10 @@
1
+ require 'rbcluster'
2
+
3
+ data = [
4
+ [1, 1, 0],
5
+ [1, 0, 0],
6
+ [0, 0, 0]
7
+ ]
8
+
9
+ labels, error, nfound = Cluster.kcluster(data, :clusters => 2)
10
+ p [labels, error, nfound]
@@ -0,0 +1,4598 @@
1
+ /* The C clustering library.
2
+ * Copyright (C) 2002 Michiel Jan Laurens de Hoon.
3
+ *
4
+ * This library was written at the Laboratory of DNA Information Analysis,
5
+ * Human Genome Center, Institute of Medical Science, University of Tokyo,
6
+ * 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
7
+ * Contact: mdehoon 'AT' gsc.riken.jp
8
+ *
9
+ * Permission to use, copy, modify, and distribute this software and its
10
+ * documentation with or without modifications and for any purpose and
11
+ * without fee is hereby granted, provided that any copyright notices
12
+ * appear in all copies and that both those copyright notices and this
13
+ * permission notice appear in supporting documentation, and that the
14
+ * names of the contributors or copyright holders not be used in
15
+ * advertising or publicity pertaining to distribution of the software
16
+ * without specific prior permission.
17
+ *
18
+ * THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
19
+ * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
20
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
21
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
22
+ * OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
23
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
24
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
25
+ * OR PERFORMANCE OF THIS SOFTWARE.
26
+ *
27
+ */
28
+
29
+ #include <time.h>
30
+ #include <stdlib.h>
31
+ #include <math.h>
32
+ #include <float.h>
33
+ #include <limits.h>
34
+ #include <string.h>
35
+ #include "cluster.h"
36
+ #ifdef WINDOWS
37
+ # include <windows.h>
38
+ #endif
39
+
40
+ /* ************************************************************************ */
41
+
42
+ #ifdef WINDOWS
43
+ /* Then we make a Windows DLL */
44
+ int WINAPI
45
+ clusterdll_init (HANDLE h, DWORD reason, void* foo)
46
+ {
47
+ return 1;
48
+ }
49
+ #endif
50
+
51
+ /* ************************************************************************ */
52
+
53
+ double mean(int n, double x[])
54
+ { double result = 0.;
55
+ int i;
56
+ for (i = 0; i < n; i++) result += x[i];
57
+ result /= n;
58
+ return result;
59
+ }
60
+
61
+ /* ************************************************************************ */
62
+
63
+ double median (int n, double x[])
64
+ /*
65
+ Find the median of X(1), ... , X(N), using as much of the quicksort
66
+ algorithm as is needed to isolate it.
67
+ N.B. On exit, the array X is partially ordered.
68
+ Based on Alan J. Miller's median.f90 routine.
69
+ */
70
+
71
+ { int i, j;
72
+ int nr = n / 2;
73
+ int nl = nr - 1;
74
+ int even = 0;
75
+ /* hi & lo are position limits encompassing the median. */
76
+ int lo = 0;
77
+ int hi = n-1;
78
+
79
+ if (n==2*nr) even = 1;
80
+ if (n<3)
81
+ { if (n<1) return 0.;
82
+ if (n == 1) return x[0];
83
+ return 0.5*(x[0]+x[1]);
84
+ }
85
+
86
+ /* Find median of 1st, middle & last values. */
87
+ do
88
+ { int loop;
89
+ int mid = (lo + hi)/2;
90
+ double result = x[mid];
91
+ double xlo = x[lo];
92
+ double xhi = x[hi];
93
+ if (xhi<xlo)
94
+ { double temp = xlo;
95
+ xlo = xhi;
96
+ xhi = temp;
97
+ }
98
+ if (result>xhi) result = xhi;
99
+ else if (result<xlo) result = xlo;
100
+ /* The basic quicksort algorithm to move all values <= the sort key (XMED)
101
+ * to the left-hand end, and all higher values to the other end.
102
+ */
103
+ i = lo;
104
+ j = hi;
105
+ do
106
+ { while (x[i]<result) i++;
107
+ while (x[j]>result) j--;
108
+ loop = 0;
109
+ if (i<j)
110
+ { double temp = x[i];
111
+ x[i] = x[j];
112
+ x[j] = temp;
113
+ i++;
114
+ j--;
115
+ if (i<=j) loop = 1;
116
+ }
117
+ } while (loop); /* Decide which half the median is in. */
118
+
119
+ if (even)
120
+ { if (j==nl && i==nr)
121
+ /* Special case, n even, j = n/2 & i = j + 1, so the median is
122
+ * between the two halves of the series. Find max. of the first
123
+ * half & min. of the second half, then average.
124
+ */
125
+ { int k;
126
+ double xmax = x[0];
127
+ double xmin = x[n-1];
128
+ for (k = lo; k <= j; k++) xmax = max(xmax,x[k]);
129
+ for (k = i; k <= hi; k++) xmin = min(xmin,x[k]);
130
+ return 0.5*(xmin + xmax);
131
+ }
132
+ if (j<nl) lo = i;
133
+ if (i>nr) hi = j;
134
+ if (i==j)
135
+ { if (i==nl) lo = nl;
136
+ if (j==nr) hi = nr;
137
+ }
138
+ }
139
+ else
140
+ { if (j<nr) lo = i;
141
+ if (i>nr) hi = j;
142
+ /* Test whether median has been isolated. */
143
+ if (i==j && i==nr) return result;
144
+ }
145
+ }
146
+ while (lo<hi-1);
147
+
148
+ if (even) return (0.5*(x[nl]+x[nr]));
149
+ if (x[lo]>x[hi])
150
+ { double temp = x[lo];
151
+ x[lo] = x[hi];
152
+ x[hi] = temp;
153
+ }
154
+ return x[nr];
155
+ }
156
+
157
+ /* ********************************************************************** */
158
+
159
+ static const double* sortdata = NULL; /* used in the quicksort algorithm */
160
+
161
+ /* ---------------------------------------------------------------------- */
162
+
163
+ static
164
+ int compare(const void* a, const void* b)
165
+ /* Helper function for sort. Previously, this was a nested function under
166
+ * sort, which is not allowed under ANSI C.
167
+ */
168
+ { const int i1 = *(const int*)a;
169
+ const int i2 = *(const int*)b;
170
+ const double term1 = sortdata[i1];
171
+ const double term2 = sortdata[i2];
172
+ if (term1 < term2) return -1;
173
+ if (term1 > term2) return +1;
174
+ return 0;
175
+ }
176
+
177
+ /* ---------------------------------------------------------------------- */
178
+
179
+ void sort(int n, const double data[], int index[])
180
+ /* Sets up an index table given the data, such that data[index[]] is in
181
+ * increasing order. Sorting is done on the indices; the array data
182
+ * is unchanged.
183
+ */
184
+ { int i;
185
+ sortdata = data;
186
+ for (i = 0; i < n; i++) index[i] = i;
187
+ qsort(index, n, sizeof(int), compare);
188
+ }
189
+
190
+ /* ********************************************************************** */
191
+
192
+ static double* getrank (int n, double data[])
193
+ /* Calculates the ranks of the elements in the array data. Two elements with
194
+ * the same value get the same rank, equal to the average of the ranks had the
195
+ * elements different values. The ranks are returned as a newly allocated
196
+ * array that should be freed by the calling routine. If getrank fails due to
197
+ * a memory allocation error, it returns NULL.
198
+ */
199
+ { int i;
200
+ double* rank;
201
+ int* index;
202
+ rank = malloc(n*sizeof(double));
203
+ if (!rank) return NULL;
204
+ index = malloc(n*sizeof(int));
205
+ if (!index)
206
+ { free(rank);
207
+ return NULL;
208
+ }
209
+ /* Call sort to get an index table */
210
+ sort (n, data, index);
211
+ /* Build a rank table */
212
+ for (i = 0; i < n; i++) rank[index[i]] = i;
213
+ /* Fix for equal ranks */
214
+ i = 0;
215
+ while (i < n)
216
+ { int m;
217
+ double value = data[index[i]];
218
+ int j = i + 1;
219
+ while (j < n && data[index[j]] == value) j++;
220
+ m = j - i; /* number of equal ranks found */
221
+ value = rank[index[i]] + (m-1)/2.;
222
+ for (j = i; j < i + m; j++) rank[index[j]] = value;
223
+ i += m;
224
+ }
225
+ free (index);
226
+ return rank;
227
+ }
228
+
229
+ /* ---------------------------------------------------------------------- */
230
+
231
+ static int
232
+ makedatamask(int nrows, int ncols, double*** pdata, int*** pmask)
233
+ { int i;
234
+ double** data;
235
+ int** mask;
236
+ data = malloc(nrows*sizeof(double*));
237
+ if(!data) return 0;
238
+ mask = malloc(nrows*sizeof(int*));
239
+ if(!mask)
240
+ { free(data);
241
+ return 0;
242
+ }
243
+ for (i = 0; i < nrows; i++)
244
+ { data[i] = malloc(ncols*sizeof(double));
245
+ if(!data[i]) break;
246
+ mask[i] = malloc(ncols*sizeof(int));
247
+ if(!mask[i])
248
+ { free(data[i]);
249
+ break;
250
+ }
251
+ }
252
+ if (i==nrows) /* break not encountered */
253
+ { *pdata = data;
254
+ *pmask = mask;
255
+ return 1;
256
+ }
257
+ *pdata = NULL;
258
+ *pmask = NULL;
259
+ nrows = i;
260
+ for (i = 0; i < nrows; i++)
261
+ { free(data[i]);
262
+ free(mask[i]);
263
+ }
264
+ free(data);
265
+ free(mask);
266
+ return 0;
267
+ }
268
+
269
+ /* ---------------------------------------------------------------------- */
270
+
271
+ static void
272
+ freedatamask(int n, double** data, int** mask)
273
+ { int i;
274
+ for (i = 0; i < n; i++)
275
+ { free(mask[i]);
276
+ free(data[i]);
277
+ }
278
+ free(mask);
279
+ free(data);
280
+ }
281
+
282
+ /* ---------------------------------------------------------------------- */
283
+
284
+ static
285
+ double find_closest_pair(int n, double** distmatrix, int* ip, int* jp)
286
+ /*
287
+ This function searches the distance matrix to find the pair with the shortest
288
+ distance between them. The indices of the pair are returned in ip and jp; the
289
+ distance itself is returned by the function.
290
+
291
+ n (input) int
292
+ The number of elements in the distance matrix.
293
+
294
+ distmatrix (input) double**
295
+ A ragged array containing the distance matrix. The number of columns in each
296
+ row is one less than the row index.
297
+
298
+ ip (output) int*
299
+ A pointer to the integer that is to receive the first index of the pair with
300
+ the shortest distance.
301
+
302
+ jp (output) int*
303
+ A pointer to the integer that is to receive the second index of the pair with
304
+ the shortest distance.
305
+ */
306
+ { int i, j;
307
+ double temp;
308
+ double distance = distmatrix[1][0];
309
+ *ip = 1;
310
+ *jp = 0;
311
+ for (i = 1; i < n; i++)
312
+ { for (j = 0; j < i; j++)
313
+ { temp = distmatrix[i][j];
314
+ if (temp<distance)
315
+ { distance = temp;
316
+ *ip = i;
317
+ *jp = j;
318
+ }
319
+ }
320
+ }
321
+ return distance;
322
+ }
323
+
324
+ /* ********************************************************************* */
325
+
326
+ static int svd(int m, int n, double** u, double w[], double** vt)
327
+ /*
328
+ * This subroutine is a translation of the Algol procedure svd,
329
+ * Num. Math. 14, 403-420(1970) by Golub and Reinsch.
330
+ * Handbook for Auto. Comp., Vol II-Linear Algebra, 134-151(1971).
331
+ *
332
+ * This subroutine determines the singular value decomposition
333
+ * t
334
+ * A=usv of a real m by n rectangular matrix, where m is greater
335
+ * than or equal to n. Householder bidiagonalization and a variant
336
+ * of the QR algorithm are used.
337
+ *
338
+ *
339
+ * On input.
340
+ *
341
+ * m is the number of rows of A (and u).
342
+ *
343
+ * n is the number of columns of A (and u) and the order of v.
344
+ *
345
+ * u contains the rectangular input matrix A to be decomposed.
346
+ *
347
+ * On output.
348
+ *
349
+ * the routine returns an integer ierr equal to
350
+ * 0 to indicate a normal return,
351
+ * k if the k-th singular value has not been
352
+ * determined after 30 iterations,
353
+ * -1 if memory allocation fails.
354
+ *
355
+ *
356
+ * w contains the n (non-negative) singular values of a (the
357
+ * diagonal elements of s). they are unordered. if an
358
+ * error exit is made, the singular values should be correct
359
+ * for indices ierr+1,ierr+2,...,n.
360
+ *
361
+ *
362
+ * u contains the matrix u (orthogonal column vectors) of the
363
+ * decomposition.
364
+ * if an error exit is made, the columns of u corresponding
365
+ * to indices of correct singular values should be correct.
366
+ *
367
+ * t
368
+ * vt contains the matrix v (orthogonal) of the decomposition.
369
+ * if an error exit is made, the columns of v corresponding
370
+ * to indices of correct singular values should be correct.
371
+ *
372
+ *
373
+ * Questions and comments should be directed to B. S. Garbow,
374
+ * Applied Mathematics division, Argonne National Laboratory
375
+ *
376
+ * Modified to eliminate machep
377
+ *
378
+ * Translated to C by Michiel de Hoon, Human Genome Center,
379
+ * University of Tokyo, for inclusion in the C Clustering Library.
380
+ * This routine is less general than the original svd routine, as
381
+ * it focuses on the singular value decomposition as needed for
382
+ * clustering. In particular,
383
+ * - We calculate both u and v in all cases
384
+ * - We pass the input array A via u; this array is subsequently
385
+ * overwritten.
386
+ * - We allocate for the array rv1, used as a working space,
387
+ * internally in this routine, instead of passing it as an
388
+ * argument. If the allocation fails, svd returns -1.
389
+ * 2003.06.05
390
+ */
391
+ { int i, j, k, i1, k1, l1, its;
392
+ double c,f,h,s,x,y,z;
393
+ int l = 0;
394
+ int ierr = 0;
395
+ double g = 0.0;
396
+ double scale = 0.0;
397
+ double anorm = 0.0;
398
+ double* rv1 = malloc(n*sizeof(double));
399
+ if (!rv1) return -1;
400
+ if (m >= n)
401
+ { /* Householder reduction to bidiagonal form */
402
+ for (i = 0; i < n; i++)
403
+ { l = i + 1;
404
+ rv1[i] = scale * g;
405
+ g = 0.0;
406
+ s = 0.0;
407
+ scale = 0.0;
408
+ for (k = i; k < m; k++) scale += fabs(u[k][i]);
409
+ if (scale != 0.0)
410
+ { for (k = i; k < m; k++)
411
+ { u[k][i] /= scale;
412
+ s += u[k][i]*u[k][i];
413
+ }
414
+ f = u[i][i];
415
+ g = (f >= 0) ? -sqrt(s) : sqrt(s);
416
+ h = f * g - s;
417
+ u[i][i] = f - g;
418
+ if (i < n-1)
419
+ { for (j = l; j < n; j++)
420
+ { s = 0.0;
421
+ for (k = i; k < m; k++) s += u[k][i] * u[k][j];
422
+ f = s / h;
423
+ for (k = i; k < m; k++) u[k][j] += f * u[k][i];
424
+ }
425
+ }
426
+ for (k = i; k < m; k++) u[k][i] *= scale;
427
+ }
428
+ w[i] = scale * g;
429
+ g = 0.0;
430
+ s = 0.0;
431
+ scale = 0.0;
432
+ if (i<n-1)
433
+ { for (k = l; k < n; k++) scale += fabs(u[i][k]);
434
+ if (scale != 0.0)
435
+ { for (k = l; k < n; k++)
436
+ { u[i][k] /= scale;
437
+ s += u[i][k] * u[i][k];
438
+ }
439
+ f = u[i][l];
440
+ g = (f >= 0) ? -sqrt(s) : sqrt(s);
441
+ h = f * g - s;
442
+ u[i][l] = f - g;
443
+ for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
444
+ for (j = l; j < m; j++)
445
+ { s = 0.0;
446
+ for (k = l; k < n; k++) s += u[j][k] * u[i][k];
447
+ for (k = l; k < n; k++) u[j][k] += s * rv1[k];
448
+ }
449
+ for (k = l; k < n; k++) u[i][k] *= scale;
450
+ }
451
+ }
452
+ anorm = max(anorm,fabs(w[i])+fabs(rv1[i]));
453
+ }
454
+ /* accumulation of right-hand transformations */
455
+ for (i = n-1; i>=0; i--)
456
+ { if (i < n-1)
457
+ { if (g != 0.0)
458
+ { for (j = l; j < n; j++) vt[i][j] = (u[i][j] / u[i][l]) / g;
459
+ /* double division avoids possible underflow */
460
+ for (j = l; j < n; j++)
461
+ { s = 0.0;
462
+ for (k = l; k < n; k++) s += u[i][k] * vt[j][k];
463
+ for (k = l; k < n; k++) vt[j][k] += s * vt[i][k];
464
+ }
465
+ }
466
+ }
467
+ for (j = l; j < n; j++)
468
+ { vt[j][i] = 0.0;
469
+ vt[i][j] = 0.0;
470
+ }
471
+ vt[i][i] = 1.0;
472
+ g = rv1[i];
473
+ l = i;
474
+ }
475
+ /* accumulation of left-hand transformations */
476
+ for (i = n-1; i >= 0; i--)
477
+ { l = i + 1;
478
+ g = w[i];
479
+ if (i!=n-1)
480
+ for (j = l; j < n; j++) u[i][j] = 0.0;
481
+ if (g!=0.0)
482
+ { if (i!=n-1)
483
+ { for (j = l; j < n; j++)
484
+ { s = 0.0;
485
+ for (k = l; k < m; k++) s += u[k][i] * u[k][j];
486
+ /* double division avoids possible underflow */
487
+ f = (s / u[i][i]) / g;
488
+ for (k = i; k < m; k++) u[k][j] += f * u[k][i];
489
+ }
490
+ }
491
+ for (j = i; j < m; j++) u[j][i] /= g;
492
+ }
493
+ else
494
+ for (j = i; j < m; j++) u[j][i] = 0.0;
495
+ u[i][i] += 1.0;
496
+ }
497
+ /* diagonalization of the bidiagonal form */
498
+ for (k = n-1; k >= 0; k--)
499
+ { k1 = k-1;
500
+ its = 0;
501
+ while(1)
502
+ /* test for splitting */
503
+ { for (l = k; l >= 0; l--)
504
+ { l1 = l-1;
505
+ if (fabs(rv1[l]) + anorm == anorm) break;
506
+ /* rv1[0] is always zero, so there is no exit
507
+ * through the bottom of the loop */
508
+ if (fabs(w[l1]) + anorm == anorm)
509
+ /* cancellation of rv1[l] if l greater than 0 */
510
+ { c = 0.0;
511
+ s = 1.0;
512
+ for (i = l; i <= k; i++)
513
+ { f = s * rv1[i];
514
+ rv1[i] *= c;
515
+ if (fabs(f) + anorm == anorm) break;
516
+ g = w[i];
517
+ h = sqrt(f*f+g*g);
518
+ w[i] = h;
519
+ c = g / h;
520
+ s = -f / h;
521
+ for (j = 0; j < m; j++)
522
+ { y = u[j][l1];
523
+ z = u[j][i];
524
+ u[j][l1] = y * c + z * s;
525
+ u[j][i] = -y * s + z * c;
526
+ }
527
+ }
528
+ break;
529
+ }
530
+ }
531
+ /* test for convergence */
532
+ z = w[k];
533
+ if (l==k) /* convergence */
534
+ { if (z < 0.0)
535
+ /* w[k] is made non-negative */
536
+ { w[k] = -z;
537
+ for (j = 0; j < n; j++) vt[k][j] = -vt[k][j];
538
+ }
539
+ break;
540
+ }
541
+ else if (its==30)
542
+ { ierr = k;
543
+ break;
544
+ }
545
+ else
546
+ /* shift from bottom 2 by 2 minor */
547
+ { its++;
548
+ x = w[l];
549
+ y = w[k1];
550
+ g = rv1[k1];
551
+ h = rv1[k];
552
+ f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
553
+ g = sqrt(f*f+1.0);
554
+ f = ((x - z) * (x + z) + h * (y / (f + (f >= 0 ? g : -g)) - h)) / x;
555
+ /* next qr transformation */
556
+ c = 1.0;
557
+ s = 1.0;
558
+ for (i1 = l; i1 <= k1; i1++)
559
+ { i = i1 + 1;
560
+ g = rv1[i];
561
+ y = w[i];
562
+ h = s * g;
563
+ g = c * g;
564
+ z = sqrt(f*f+h*h);
565
+ rv1[i1] = z;
566
+ c = f / z;
567
+ s = h / z;
568
+ f = x * c + g * s;
569
+ g = -x * s + g * c;
570
+ h = y * s;
571
+ y = y * c;
572
+ for (j = 0; j < n; j++)
573
+ { x = vt[i1][j];
574
+ z = vt[i][j];
575
+ vt[i1][j] = x * c + z * s;
576
+ vt[i][j] = -x * s + z * c;
577
+ }
578
+ z = sqrt(f*f+h*h);
579
+ w[i1] = z;
580
+ /* rotation can be arbitrary if z is zero */
581
+ if (z!=0.0)
582
+ { c = f / z;
583
+ s = h / z;
584
+ }
585
+ f = c * g + s * y;
586
+ x = -s * g + c * y;
587
+ for (j = 0; j < m; j++)
588
+ { y = u[j][i1];
589
+ z = u[j][i];
590
+ u[j][i1] = y * c + z * s;
591
+ u[j][i] = -y * s + z * c;
592
+ }
593
+ }
594
+ rv1[l] = 0.0;
595
+ rv1[k] = f;
596
+ w[k] = x;
597
+ }
598
+ }
599
+ }
600
+ }
601
+ else /* m < n */
602
+ { /* Householder reduction to bidiagonal form */
603
+ for (i = 0; i < m; i++)
604
+ { l = i + 1;
605
+ rv1[i] = scale * g;
606
+ g = 0.0;
607
+ s = 0.0;
608
+ scale = 0.0;
609
+ for (k = i; k < n; k++) scale += fabs(u[i][k]);
610
+ if (scale != 0.0)
611
+ { for (k = i; k < n; k++)
612
+ { u[i][k] /= scale;
613
+ s += u[i][k]*u[i][k];
614
+ }
615
+ f = u[i][i];
616
+ g = (f >= 0) ? -sqrt(s) : sqrt(s);
617
+ h = f * g - s;
618
+ u[i][i] = f - g;
619
+ if (i < m-1)
620
+ { for (j = l; j < m; j++)
621
+ { s = 0.0;
622
+ for (k = i; k < n; k++) s += u[i][k] * u[j][k];
623
+ f = s / h;
624
+ for (k = i; k < n; k++) u[j][k] += f * u[i][k];
625
+ }
626
+ }
627
+ for (k = i; k < n; k++) u[i][k] *= scale;
628
+ }
629
+ w[i] = scale * g;
630
+ g = 0.0;
631
+ s = 0.0;
632
+ scale = 0.0;
633
+ if (i<m-1)
634
+ { for (k = l; k < m; k++) scale += fabs(u[k][i]);
635
+ if (scale != 0.0)
636
+ { for (k = l; k < m; k++)
637
+ { u[k][i] /= scale;
638
+ s += u[k][i] * u[k][i];
639
+ }
640
+ f = u[l][i];
641
+ g = (f >= 0) ? -sqrt(s) : sqrt(s);
642
+ h = f * g - s;
643
+ u[l][i] = f - g;
644
+ for (k = l; k < m; k++) rv1[k] = u[k][i] / h;
645
+ for (j = l; j < n; j++)
646
+ { s = 0.0;
647
+ for (k = l; k < m; k++) s += u[k][j] * u[k][i];
648
+ for (k = l; k < m; k++) u[k][j] += s * rv1[k];
649
+ }
650
+ for (k = l; k < m; k++) u[k][i] *= scale;
651
+ }
652
+ }
653
+ anorm = max(anorm,fabs(w[i])+fabs(rv1[i]));
654
+ }
655
+ /* accumulation of right-hand transformations */
656
+ for (i = m-1; i>=0; i--)
657
+ { if (i < m-1)
658
+ { if (g != 0.0)
659
+ { for (j = l; j < m; j++) vt[j][i] = (u[j][i] / u[l][i]) / g;
660
+ /* double division avoids possible underflow */
661
+ for (j = l; j < m; j++)
662
+ { s = 0.0;
663
+ for (k = l; k < m; k++) s += u[k][i] * vt[k][j];
664
+ for (k = l; k < m; k++) vt[k][j] += s * vt[k][i];
665
+ }
666
+ }
667
+ }
668
+ for (j = l; j < m; j++)
669
+ { vt[i][j] = 0.0;
670
+ vt[j][i] = 0.0;
671
+ }
672
+ vt[i][i] = 1.0;
673
+ g = rv1[i];
674
+ l = i;
675
+ }
676
+ /* accumulation of left-hand transformations */
677
+ for (i = m-1; i >= 0; i--)
678
+ { l = i + 1;
679
+ g = w[i];
680
+ if (i!=m-1)
681
+ for (j = l; j < m; j++) u[j][i] = 0.0;
682
+ if (g!=0.0)
683
+ { if (i!=m-1)
684
+ { for (j = l; j < m; j++)
685
+ { s = 0.0;
686
+ for (k = l; k < n; k++) s += u[i][k] * u[j][k];
687
+ /* double division avoids possible underflow */
688
+ f = (s / u[i][i]) / g;
689
+ for (k = i; k < n; k++) u[j][k] += f * u[i][k];
690
+ }
691
+ }
692
+ for (j = i; j < n; j++) u[i][j] /= g;
693
+ }
694
+ else
695
+ for (j = i; j < n; j++) u[i][j] = 0.0;
696
+ u[i][i] += 1.0;
697
+ }
698
+ /* diagonalization of the bidiagonal form */
699
+ for (k = m-1; k >= 0; k--)
700
+ { k1 = k-1;
701
+ its = 0;
702
+ while(1)
703
+ /* test for splitting */
704
+ { for (l = k; l >= 0; l--)
705
+ { l1 = l-1;
706
+ if (fabs(rv1[l]) + anorm == anorm) break;
707
+ /* rv1[0] is always zero, so there is no exit
708
+ * through the bottom of the loop */
709
+ if (fabs(w[l1]) + anorm == anorm)
710
+ /* cancellation of rv1[l] if l greater than 0 */
711
+ { c = 0.0;
712
+ s = 1.0;
713
+ for (i = l; i <= k; i++)
714
+ { f = s * rv1[i];
715
+ rv1[i] *= c;
716
+ if (fabs(f) + anorm == anorm) break;
717
+ g = w[i];
718
+ h = sqrt(f*f+g*g);
719
+ w[i] = h;
720
+ c = g / h;
721
+ s = -f / h;
722
+ for (j = 0; j < n; j++)
723
+ { y = u[l1][j];
724
+ z = u[i][j];
725
+ u[l1][j] = y * c + z * s;
726
+ u[i][j] = -y * s + z * c;
727
+ }
728
+ }
729
+ break;
730
+ }
731
+ }
732
+ /* test for convergence */
733
+ z = w[k];
734
+ if (l==k) /* convergence */
735
+ { if (z < 0.0)
736
+ /* w[k] is made non-negative */
737
+ { w[k] = -z;
738
+ for (j = 0; j < m; j++) vt[j][k] = -vt[j][k];
739
+ }
740
+ break;
741
+ }
742
+ else if (its==30)
743
+ { ierr = k;
744
+ break;
745
+ }
746
+ else
747
+ /* shift from bottom 2 by 2 minor */
748
+ { its++;
749
+ x = w[l];
750
+ y = w[k1];
751
+ g = rv1[k1];
752
+ h = rv1[k];
753
+ f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
754
+ g = sqrt(f*f+1.0);
755
+ f = ((x - z) * (x + z) + h * (y / (f + (f >= 0 ? g : -g)) - h)) / x;
756
+ /* next qr transformation */
757
+ c = 1.0;
758
+ s = 1.0;
759
+ for (i1 = l; i1 <= k1; i1++)
760
+ { i = i1 + 1;
761
+ g = rv1[i];
762
+ y = w[i];
763
+ h = s * g;
764
+ g = c * g;
765
+ z = sqrt(f*f+h*h);
766
+ rv1[i1] = z;
767
+ c = f / z;
768
+ s = h / z;
769
+ f = x * c + g * s;
770
+ g = -x * s + g * c;
771
+ h = y * s;
772
+ y = y * c;
773
+ for (j = 0; j < m; j++)
774
+ { x = vt[j][i1];
775
+ z = vt[j][i];
776
+ vt[j][i1] = x * c + z * s;
777
+ vt[j][i] = -x * s + z * c;
778
+ }
779
+ z = sqrt(f*f+h*h);
780
+ w[i1] = z;
781
+ /* rotation can be arbitrary if z is zero */
782
+ if (z!=0.0)
783
+ { c = f / z;
784
+ s = h / z;
785
+ }
786
+ f = c * g + s * y;
787
+ x = -s * g + c * y;
788
+ for (j = 0; j < n; j++)
789
+ { y = u[i1][j];
790
+ z = u[i][j];
791
+ u[i1][j] = y * c + z * s;
792
+ u[i][j] = -y * s + z * c;
793
+ }
794
+ }
795
+ rv1[l] = 0.0;
796
+ rv1[k] = f;
797
+ w[k] = x;
798
+ }
799
+ }
800
+ }
801
+ }
802
+ free(rv1);
803
+ return ierr;
804
+ }
805
+
806
+ /* ********************************************************************* */
807
+
808
+ int pca(int nrows, int ncolumns, double** u, double** v, double* w)
809
+ /*
810
+ Purpose
811
+ =======
812
+
813
+ This subroutine uses the singular value decomposition to perform principal
814
+ components analysis of a real nrows by ncolumns rectangular matrix.
815
+
816
+ Arguments
817
+ =========
818
+
819
+ nrows (input) int
820
+ The number of rows in the matrix u.
821
+
822
+ ncolumns (input) int
823
+ The number of columns in the matrix v.
824
+
825
+ u (input) double[nrows][ncolumns]
826
+ On input, the array containing the data to which the principal component
827
+ analysis should be applied. The function assumes that the mean has already been
828
+ subtracted of each column, and hence that the mean of each column is zero.
829
+ On output, see below.
830
+
831
+ v (input) double[n][n], where n = min(nrows, ncolumns)
832
+ Not used on input.
833
+
834
+ w (input) double[n], where n = min(nrows, ncolumns)
835
+ Not used on input.
836
+
837
+
838
+ Return value
839
+ ============
840
+
841
+ On output:
842
+
843
+ If nrows >= ncolumns, then
844
+
845
+ u contains the coordinates with respect to the principal components;
846
+ v contains the principal component vectors.
847
+
848
+ The dot product u . v reproduces the data that were passed in u.
849
+
850
+
851
+ If nrows < ncolumns, then
852
+
853
+ u contains the principal component vectors;
854
+ v contains the coordinates with respect to the principal components.
855
+
856
+ The dot product v . u reproduces the data that were passed in u.
857
+
858
+ The eigenvalues of the covariance matrix are returned in w.
859
+
860
+ The arrays u, v, and w are sorted according to eigenvalue, with the largest
861
+ eigenvalues appearing first.
862
+
863
+ The function returns 0 if successful, -1 if memory allocation fails, and a
864
+ positive integer if the singular value decomposition fails to converge.
865
+ */
866
+ {
867
+ int i;
868
+ int j;
869
+ int error;
870
+ int* index = malloc(ncolumns*sizeof(int));
871
+ double* temp = malloc(ncolumns*sizeof(double));
872
+ if (!index || !temp)
873
+ { if (index) free(index);
874
+ if (temp) free(temp);
875
+ return -1;
876
+ }
877
+ error = svd(nrows, ncolumns, u, w, v);
878
+ if (error==0)
879
+ {
880
+ if (nrows >= ncolumns)
881
+ { for (j = 0; j < ncolumns; j++)
882
+ { const double s = w[j];
883
+ for (i = 0; i < nrows; i++) u[i][j] *= s;
884
+ }
885
+ sort(ncolumns, w, index);
886
+ for (i = 0; i < ncolumns/2; i++)
887
+ { j = index[i];
888
+ index[i] = index[ncolumns-1-i];
889
+ index[ncolumns-1-i] = j;
890
+ }
891
+ for (i = 0; i < nrows; i++)
892
+ { for (j = 0; j < ncolumns; j++) temp[j] = u[i][index[j]];
893
+ for (j = 0; j < ncolumns; j++) u[i][j] = temp[j];
894
+ }
895
+ for (i = 0; i < ncolumns; i++)
896
+ { for (j = 0; j < ncolumns; j++) temp[j] = v[index[j]][i];
897
+ for (j = 0; j < ncolumns; j++) v[j][i] = temp[j];
898
+ }
899
+ for (i = 0; i < ncolumns; i++) temp[i] = w[index[i]];
900
+ for (i = 0; i < ncolumns; i++) w[i] = temp[i];
901
+ }
902
+ else /* nrows < ncolumns */
903
+ { for (j = 0; j < nrows; j++)
904
+ { const double s = w[j];
905
+ for (i = 0; i < nrows; i++) v[i][j] *= s;
906
+ }
907
+ sort(nrows, w, index);
908
+ for (i = 0; i < nrows/2; i++)
909
+ { j = index[i];
910
+ index[i] = index[nrows-1-i];
911
+ index[nrows-1-i] = j;
912
+ }
913
+ for (j = 0; j < ncolumns; j++)
914
+ { for (i = 0; i < nrows; i++) temp[i] = u[index[i]][j];
915
+ for (i = 0; i < nrows; i++) u[i][j] = temp[i];
916
+ }
917
+ for (j = 0; j < nrows; j++)
918
+ { for (i = 0; i < nrows; i++) temp[i] = v[j][index[i]];
919
+ for (i = 0; i < nrows; i++) v[j][i] = temp[i];
920
+ }
921
+ for (i = 0; i < nrows; i++) temp[i] = w[index[i]];
922
+ for (i = 0; i < nrows; i++) w[i] = temp[i];
923
+ }
924
+ }
925
+ free(index);
926
+ free(temp);
927
+ return error;
928
+ }
929
+
930
+ /* ********************************************************************* */
931
+
932
+ static
933
+ double euclid (int n, double** data1, double** data2, int** mask1, int** mask2,
934
+ const double weight[], int index1, int index2, int transpose)
935
+
936
+ /*
937
+ Purpose
938
+ =======
939
+
940
+ The euclid routine calculates the weighted Euclidean distance between two
941
+ rows or columns in a matrix.
942
+
943
+ Arguments
944
+ =========
945
+
946
+ n (input) int
947
+ The number of elements in a row or column. If transpose==0, then n is the number
948
+ of columns; otherwise, n is the number of rows.
949
+
950
+ data1 (input) double array
951
+ The data array containing the first vector.
952
+
953
+ data2 (input) double array
954
+ The data array containing the second vector.
955
+
956
+ mask1 (input) int array
957
+ This array which elements in data1 are missing. If mask1[i][j]==0, then
958
+ data1[i][j] is missing.
959
+
960
+ mask2 (input) int array
961
+ This array which elements in data2 are missing. If mask2[i][j]==0, then
962
+ data2[i][j] is missing.
963
+
964
+ weight (input) double[n]
965
+ The weights that are used to calculate the distance.
966
+
967
+ index1 (input) int
968
+ Index of the first row or column.
969
+
970
+ index2 (input) int
971
+ Index of the second row or column.
972
+
973
+ transpose (input) int
974
+ If transpose==0, the distance between two rows in the matrix is calculated.
975
+ Otherwise, the distance between two columns in the matrix is calculated.
976
+
977
+ ============================================================================
978
+ */
979
+ { double result = 0.;
980
+ double tweight = 0;
981
+ int i;
982
+ if (transpose==0) /* Calculate the distance between two rows */
983
+ { for (i = 0; i < n; i++)
984
+ { if (mask1[index1][i] && mask2[index2][i])
985
+ { double term = data1[index1][i] - data2[index2][i];
986
+ result += weight[i]*term*term;
987
+ tweight += weight[i];
988
+ }
989
+ }
990
+ }
991
+ else
992
+ { for (i = 0; i < n; i++)
993
+ { if (mask1[i][index1] && mask2[i][index2])
994
+ { double term = data1[i][index1] - data2[i][index2];
995
+ result += weight[i]*term*term;
996
+ tweight += weight[i];
997
+ }
998
+ }
999
+ }
1000
+ if (!tweight) return 0; /* usually due to empty clusters */
1001
+ result /= tweight;
1002
+ return result;
1003
+ }
1004
+
1005
+ /* ********************************************************************* */
1006
+
1007
+ static
1008
+ double cityblock (int n, double** data1, double** data2, int** mask1,
1009
+ int** mask2, const double weight[], int index1, int index2, int transpose)
1010
+
1011
+ /*
1012
+ Purpose
1013
+ =======
1014
+
1015
+ The cityblock routine calculates the weighted "City Block" distance between
1016
+ two rows or columns in a matrix. City Block distance is defined as the
1017
+ absolute value of X1-X2 plus the absolute value of Y1-Y2 plus..., which is
1018
+ equivalent to taking an "up and over" path.
1019
+
1020
+ Arguments
1021
+ =========
1022
+
1023
+ n (input) int
1024
+ The number of elements in a row or column. If transpose==0, then n is the number
1025
+ of columns; otherwise, n is the number of rows.
1026
+
1027
+ data1 (input) double array
1028
+ The data array containing the first vector.
1029
+
1030
+ data2 (input) double array
1031
+ The data array containing the second vector.
1032
+
1033
+ mask1 (input) int array
1034
+ This array which elements in data1 are missing. If mask1[i][j]==0, then
1035
+ data1[i][j] is missing.
1036
+
1037
+ mask2 (input) int array
1038
+ This array which elements in data2 are missing. If mask2[i][j]==0, then
1039
+ data2[i][j] is missing.
1040
+
1041
+ weight (input) double[n]
1042
+ The weights that are used to calculate the distance.
1043
+
1044
+ index1 (input) int
1045
+ Index of the first row or column.
1046
+
1047
+ index2 (input) int
1048
+ Index of the second row or column.
1049
+
1050
+ transpose (input) int
1051
+ If transpose==0, the distance between two rows in the matrix is calculated.
1052
+ Otherwise, the distance between two columns in the matrix is calculated.
1053
+
1054
+ ============================================================================ */
1055
+ { double result = 0.;
1056
+ double tweight = 0;
1057
+ int i;
1058
+ if (transpose==0) /* Calculate the distance between two rows */
1059
+ { for (i = 0; i < n; i++)
1060
+ { if (mask1[index1][i] && mask2[index2][i])
1061
+ { double term = data1[index1][i] - data2[index2][i];
1062
+ result = result + weight[i]*fabs(term);
1063
+ tweight += weight[i];
1064
+ }
1065
+ }
1066
+ }
1067
+ else
1068
+ { for (i = 0; i < n; i++)
1069
+ { if (mask1[i][index1] && mask2[i][index2])
1070
+ { double term = data1[i][index1] - data2[i][index2];
1071
+ result = result + weight[i]*fabs(term);
1072
+ tweight += weight[i];
1073
+ }
1074
+ }
1075
+ }
1076
+ if (!tweight) return 0; /* usually due to empty clusters */
1077
+ result /= tweight;
1078
+ return result;
1079
+ }
1080
+
1081
+ /* ********************************************************************* */
1082
+
1083
+ static
1084
+ double correlation (int n, double** data1, double** data2, int** mask1,
1085
+ int** mask2, const double weight[], int index1, int index2, int transpose)
1086
+ /*
1087
+ Purpose
1088
+ =======
1089
+
1090
+ The correlation routine calculates the weighted Pearson distance between two
1091
+ rows or columns in a matrix. We define the Pearson distance as one minus the
1092
+ Pearson correlation.
1093
+ This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
1094
+ but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
1095
+ (e.g., choose b = a + c).
1096
+
1097
+ Arguments
1098
+ =========
1099
+
1100
+ n (input) int
1101
+ The number of elements in a row or column. If transpose==0, then n is the number
1102
+ of columns; otherwise, n is the number of rows.
1103
+
1104
+ data1 (input) double array
1105
+ The data array containing the first vector.
1106
+
1107
+ data2 (input) double array
1108
+ The data array containing the second vector.
1109
+
1110
+ mask1 (input) int array
1111
+ This array which elements in data1 are missing. If mask1[i][j]==0, then
1112
+ data1[i][j] is missing.
1113
+
1114
+ mask2 (input) int array
1115
+ This array which elements in data2 are missing. If mask2[i][j]==0, then
1116
+ data2[i][j] is missing.
1117
+
1118
+ weight (input) double[n]
1119
+ The weights that are used to calculate the distance.
1120
+
1121
+ index1 (input) int
1122
+ Index of the first row or column.
1123
+
1124
+ index2 (input) int
1125
+ Index of the second row or column.
1126
+
1127
+ transpose (input) int
1128
+ If transpose==0, the distance between two rows in the matrix is calculated.
1129
+ Otherwise, the distance between two columns in the matrix is calculated.
1130
+ ============================================================================
1131
+ */
1132
+ { double result = 0.;
1133
+ double sum1 = 0.;
1134
+ double sum2 = 0.;
1135
+ double denom1 = 0.;
1136
+ double denom2 = 0.;
1137
+ double tweight = 0.;
1138
+ if (transpose==0) /* Calculate the distance between two rows */
1139
+ { int i;
1140
+ for (i = 0; i < n; i++)
1141
+ { if (mask1[index1][i] && mask2[index2][i])
1142
+ { double term1 = data1[index1][i];
1143
+ double term2 = data2[index2][i];
1144
+ double w = weight[i];
1145
+ sum1 += w*term1;
1146
+ sum2 += w*term2;
1147
+ result += w*term1*term2;
1148
+ denom1 += w*term1*term1;
1149
+ denom2 += w*term2*term2;
1150
+ tweight += w;
1151
+ }
1152
+ }
1153
+ }
1154
+ else
1155
+ { int i;
1156
+ for (i = 0; i < n; i++)
1157
+ { if (mask1[i][index1] && mask2[i][index2])
1158
+ { double term1 = data1[i][index1];
1159
+ double term2 = data2[i][index2];
1160
+ double w = weight[i];
1161
+ sum1 += w*term1;
1162
+ sum2 += w*term2;
1163
+ result += w*term1*term2;
1164
+ denom1 += w*term1*term1;
1165
+ denom2 += w*term2*term2;
1166
+ tweight += w;
1167
+ }
1168
+ }
1169
+ }
1170
+ if (!tweight) return 0; /* usually due to empty clusters */
1171
+ result -= sum1 * sum2 / tweight;
1172
+ denom1 -= sum1 * sum1 / tweight;
1173
+ denom2 -= sum2 * sum2 / tweight;
1174
+ if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
1175
+ if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
1176
+ result = result / sqrt(denom1*denom2);
1177
+ result = 1. - result;
1178
+ return result;
1179
+ }
1180
+
1181
+ /* ********************************************************************* */
1182
+
1183
+ static
1184
+ double acorrelation (int n, double** data1, double** data2, int** mask1,
1185
+ int** mask2, const double weight[], int index1, int index2, int transpose)
1186
+ /*
1187
+ Purpose
1188
+ =======
1189
+
1190
+ The acorrelation routine calculates the weighted Pearson distance between two
1191
+ rows or columns, using the absolute value of the correlation.
1192
+ This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
1193
+ but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
1194
+ (e.g., choose b = a + c).
1195
+
1196
+ Arguments
1197
+ =========
1198
+
1199
+ n (input) int
1200
+ The number of elements in a row or column. If transpose==0, then n is the number
1201
+ of columns; otherwise, n is the number of rows.
1202
+
1203
+ data1 (input) double array
1204
+ The data array containing the first vector.
1205
+
1206
+ data2 (input) double array
1207
+ The data array containing the second vector.
1208
+
1209
+ mask1 (input) int array
1210
+ This array which elements in data1 are missing. If mask1[i][j]==0, then
1211
+ data1[i][j] is missing.
1212
+
1213
+ mask2 (input) int array
1214
+ This array which elements in data2 are missing. If mask2[i][j]==0, then
1215
+ data2[i][j] is missing.
1216
+
1217
+ weight (input) double[n]
1218
+ The weights that are used to calculate the distance.
1219
+
1220
+ index1 (input) int
1221
+ Index of the first row or column.
1222
+
1223
+ index2 (input) int
1224
+ Index of the second row or column.
1225
+
1226
+ transpose (input) int
1227
+ If transpose==0, the distance between two rows in the matrix is calculated.
1228
+ Otherwise, the distance between two columns in the matrix is calculated.
1229
+ ============================================================================
1230
+ */
1231
+ { double result = 0.;
1232
+ double sum1 = 0.;
1233
+ double sum2 = 0.;
1234
+ double denom1 = 0.;
1235
+ double denom2 = 0.;
1236
+ double tweight = 0.;
1237
+ if (transpose==0) /* Calculate the distance between two rows */
1238
+ { int i;
1239
+ for (i = 0; i < n; i++)
1240
+ { if (mask1[index1][i] && mask2[index2][i])
1241
+ { double term1 = data1[index1][i];
1242
+ double term2 = data2[index2][i];
1243
+ double w = weight[i];
1244
+ sum1 += w*term1;
1245
+ sum2 += w*term2;
1246
+ result += w*term1*term2;
1247
+ denom1 += w*term1*term1;
1248
+ denom2 += w*term2*term2;
1249
+ tweight += w;
1250
+ }
1251
+ }
1252
+ }
1253
+ else
1254
+ { int i;
1255
+ for (i = 0; i < n; i++)
1256
+ { if (mask1[i][index1] && mask2[i][index2])
1257
+ { double term1 = data1[i][index1];
1258
+ double term2 = data2[i][index2];
1259
+ double w = weight[i];
1260
+ sum1 += w*term1;
1261
+ sum2 += w*term2;
1262
+ result += w*term1*term2;
1263
+ denom1 += w*term1*term1;
1264
+ denom2 += w*term2*term2;
1265
+ tweight += w;
1266
+ }
1267
+ }
1268
+ }
1269
+ if (!tweight) return 0; /* usually due to empty clusters */
1270
+ result -= sum1 * sum2 / tweight;
1271
+ denom1 -= sum1 * sum1 / tweight;
1272
+ denom2 -= sum2 * sum2 / tweight;
1273
+ if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
1274
+ if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
1275
+ result = fabs(result) / sqrt(denom1*denom2);
1276
+ result = 1. - result;
1277
+ return result;
1278
+ }
1279
+
1280
+ /* ********************************************************************* */
1281
+
1282
+ static
1283
+ double ucorrelation (int n, double** data1, double** data2, int** mask1,
1284
+ int** mask2, const double weight[], int index1, int index2, int transpose)
1285
+ /*
1286
+ Purpose
1287
+ =======
1288
+
1289
+ The ucorrelation routine calculates the weighted Pearson distance between two
1290
+ rows or columns, using the uncentered version of the Pearson correlation. In the
1291
+ uncentered Pearson correlation, a zero mean is used for both vectors even if
1292
+ the actual mean is nonzero.
1293
+ This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
1294
+ but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
1295
+ (e.g., choose b = a + c).
1296
+
1297
+ Arguments
1298
+ =========
1299
+
1300
+ n (input) int
1301
+ The number of elements in a row or column. If transpose==0, then n is the number
1302
+ of columns; otherwise, n is the number of rows.
1303
+
1304
+ data1 (input) double array
1305
+ The data array containing the first vector.
1306
+
1307
+ data2 (input) double array
1308
+ The data array containing the second vector.
1309
+
1310
+ mask1 (input) int array
1311
+ This array which elements in data1 are missing. If mask1[i][j]==0, then
1312
+ data1[i][j] is missing.
1313
+
1314
+ mask2 (input) int array
1315
+ This array which elements in data2 are missing. If mask2[i][j]==0, then
1316
+ data2[i][j] is missing.
1317
+
1318
+ weight (input) double[n]
1319
+ The weights that are used to calculate the distance.
1320
+
1321
+ index1 (input) int
1322
+ Index of the first row or column.
1323
+
1324
+ index2 (input) int
1325
+ Index of the second row or column.
1326
+
1327
+ transpose (input) int
1328
+ If transpose==0, the distance between two rows in the matrix is calculated.
1329
+ Otherwise, the distance between two columns in the matrix is calculated.
1330
+ ============================================================================
1331
+ */
1332
+ { double result = 0.;
1333
+ double denom1 = 0.;
1334
+ double denom2 = 0.;
1335
+ int flag = 0;
1336
+ /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
1337
+ * found.
1338
+ */
1339
+ if (transpose==0) /* Calculate the distance between two rows */
1340
+ { int i;
1341
+ for (i = 0; i < n; i++)
1342
+ { if (mask1[index1][i] && mask2[index2][i])
1343
+ { double term1 = data1[index1][i];
1344
+ double term2 = data2[index2][i];
1345
+ double w = weight[i];
1346
+ result += w*term1*term2;
1347
+ denom1 += w*term1*term1;
1348
+ denom2 += w*term2*term2;
1349
+ flag = 1;
1350
+ }
1351
+ }
1352
+ }
1353
+ else
1354
+ { int i;
1355
+ for (i = 0; i < n; i++)
1356
+ { if (mask1[i][index1] && mask2[i][index2])
1357
+ { double term1 = data1[i][index1];
1358
+ double term2 = data2[i][index2];
1359
+ double w = weight[i];
1360
+ result += w*term1*term2;
1361
+ denom1 += w*term1*term1;
1362
+ denom2 += w*term2*term2;
1363
+ flag = 1;
1364
+ }
1365
+ }
1366
+ }
1367
+ if (!flag) return 0.;
1368
+ if (denom1==0.) return 1.;
1369
+ if (denom2==0.) return 1.;
1370
+ result = result / sqrt(denom1*denom2);
1371
+ result = 1. - result;
1372
+ return result;
1373
+ }
1374
+
1375
+ /* ********************************************************************* */
1376
+
1377
+ static
1378
+ double uacorrelation (int n, double** data1, double** data2, int** mask1,
1379
+ int** mask2, const double weight[], int index1, int index2, int transpose)
1380
+ /*
1381
+ Purpose
1382
+ =======
1383
+
1384
+ The uacorrelation routine calculates the weighted Pearson distance between two
1385
+ rows or columns, using the absolute value of the uncentered version of the
1386
+ Pearson correlation. In the uncentered Pearson correlation, a zero mean is used
1387
+ for both vectors even if the actual mean is nonzero.
1388
+ This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
1389
+ but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
1390
+ (e.g., choose b = a + c).
1391
+
1392
+ Arguments
1393
+ =========
1394
+
1395
+ n (input) int
1396
+ The number of elements in a row or column. If transpose==0, then n is the number
1397
+ of columns; otherwise, n is the number of rows.
1398
+
1399
+ data1 (input) double array
1400
+ The data array containing the first vector.
1401
+
1402
+ data2 (input) double array
1403
+ The data array containing the second vector.
1404
+
1405
+ mask1 (input) int array
1406
+ This array which elements in data1 are missing. If mask1[i][j]==0, then
1407
+ data1[i][j] is missing.
1408
+
1409
+ mask2 (input) int array
1410
+ This array which elements in data2 are missing. If mask2[i][j]==0, then
1411
+ data2[i][j] is missing.
1412
+
1413
+ weight (input) double[n]
1414
+ The weights that are used to calculate the distance.
1415
+
1416
+ index1 (input) int
1417
+ Index of the first row or column.
1418
+
1419
+ index2 (input) int
1420
+ Index of the second row or column.
1421
+
1422
+ transpose (input) int
1423
+ If transpose==0, the distance between two rows in the matrix is calculated.
1424
+ Otherwise, the distance between two columns in the matrix is calculated.
1425
+ ============================================================================
1426
+ */
1427
+ { double result = 0.;
1428
+ double denom1 = 0.;
1429
+ double denom2 = 0.;
1430
+ int flag = 0;
1431
+ /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
1432
+ * found.
1433
+ */
1434
+ if (transpose==0) /* Calculate the distance between two rows */
1435
+ { int i;
1436
+ for (i = 0; i < n; i++)
1437
+ { if (mask1[index1][i] && mask2[index2][i])
1438
+ { double term1 = data1[index1][i];
1439
+ double term2 = data2[index2][i];
1440
+ double w = weight[i];
1441
+ result += w*term1*term2;
1442
+ denom1 += w*term1*term1;
1443
+ denom2 += w*term2*term2;
1444
+ flag = 1;
1445
+ }
1446
+ }
1447
+ }
1448
+ else
1449
+ { int i;
1450
+ for (i = 0; i < n; i++)
1451
+ { if (mask1[i][index1] && mask2[i][index2])
1452
+ { double term1 = data1[i][index1];
1453
+ double term2 = data2[i][index2];
1454
+ double w = weight[i];
1455
+ result += w*term1*term2;
1456
+ denom1 += w*term1*term1;
1457
+ denom2 += w*term2*term2;
1458
+ flag = 1;
1459
+ }
1460
+ }
1461
+ }
1462
+ if (!flag) return 0.;
1463
+ if (denom1==0.) return 1.;
1464
+ if (denom2==0.) return 1.;
1465
+ result = fabs(result) / sqrt(denom1*denom2);
1466
+ result = 1. - result;
1467
+ return result;
1468
+ }
1469
+
1470
+ /* ********************************************************************* */
1471
+
1472
+ static
1473
+ double spearman (int n, double** data1, double** data2, int** mask1,
1474
+ int** mask2, const double weight[], int index1, int index2, int transpose)
1475
+ /*
1476
+ Purpose
1477
+ =======
1478
+
1479
+ The spearman routine calculates the Spearman distance between two rows or
1480
+ columns. The Spearman distance is defined as one minus the Spearman rank
1481
+ correlation.
1482
+
1483
+ Arguments
1484
+ =========
1485
+
1486
+ n (input) int
1487
+ The number of elements in a row or column. If transpose==0, then n is the number
1488
+ of columns; otherwise, n is the number of rows.
1489
+
1490
+ data1 (input) double array
1491
+ The data array containing the first vector.
1492
+
1493
+ data2 (input) double array
1494
+ The data array containing the second vector.
1495
+
1496
+ mask1 (input) int array
1497
+ This array which elements in data1 are missing. If mask1[i][j]==0, then
1498
+ data1[i][j] is missing.
1499
+
1500
+ mask2 (input) int array
1501
+ This array which elements in data2 are missing. If mask2[i][j]==0, then
1502
+ data2[i][j] is missing.
1503
+
1504
+ weight (input) double[n]
1505
+ These weights are ignored, but included for consistency with other distance
1506
+ measures.
1507
+
1508
+ index1 (input) int
1509
+ Index of the first row or column.
1510
+
1511
+ index2 (input) int
1512
+ Index of the second row or column.
1513
+
1514
+ transpose (input) int
1515
+ If transpose==0, the distance between two rows in the matrix is calculated.
1516
+ Otherwise, the distance between two columns in the matrix is calculated.
1517
+ ============================================================================
1518
+ */
1519
+ { int i;
1520
+ int m = 0;
1521
+ double* rank1;
1522
+ double* rank2;
1523
+ double result = 0.;
1524
+ double denom1 = 0.;
1525
+ double denom2 = 0.;
1526
+ double avgrank;
1527
+ double* tdata1;
1528
+ double* tdata2;
1529
+ tdata1 = malloc(n*sizeof(double));
1530
+ if(!tdata1) return 0.0; /* Memory allocation error */
1531
+ tdata2 = malloc(n*sizeof(double));
1532
+ if(!tdata2) /* Memory allocation error */
1533
+ { free(tdata1);
1534
+ return 0.0;
1535
+ }
1536
+ if (transpose==0)
1537
+ { for (i = 0; i < n; i++)
1538
+ { if (mask1[index1][i] && mask2[index2][i])
1539
+ { tdata1[m] = data1[index1][i];
1540
+ tdata2[m] = data2[index2][i];
1541
+ m++;
1542
+ }
1543
+ }
1544
+ }
1545
+ else
1546
+ { for (i = 0; i < n; i++)
1547
+ { if (mask1[i][index1] && mask2[i][index2])
1548
+ { tdata1[m] = data1[i][index1];
1549
+ tdata2[m] = data2[i][index2];
1550
+ m++;
1551
+ }
1552
+ }
1553
+ }
1554
+ if (m==0)
1555
+ { free(tdata1);
1556
+ free(tdata2);
1557
+ return 0;
1558
+ }
1559
+ rank1 = getrank(m, tdata1);
1560
+ free(tdata1);
1561
+ if(!rank1)
1562
+ { free(tdata2);
1563
+ return 0.0; /* Memory allocation error */
1564
+ }
1565
+ rank2 = getrank(m, tdata2);
1566
+ free(tdata2);
1567
+ if(!rank2) /* Memory allocation error */
1568
+ { free(rank1);
1569
+ return 0.0;
1570
+ }
1571
+ avgrank = 0.5*(m-1); /* Average rank */
1572
+ for (i = 0; i < m; i++)
1573
+ { const double value1 = rank1[i];
1574
+ const double value2 = rank2[i];
1575
+ result += value1 * value2;
1576
+ denom1 += value1 * value1;
1577
+ denom2 += value2 * value2;
1578
+ }
1579
+ /* Note: denom1 and denom2 cannot be calculated directly from the number
1580
+ * of elements. If two elements have the same rank, the squared sum of
1581
+ * their ranks will change.
1582
+ */
1583
+ free(rank1);
1584
+ free(rank2);
1585
+ result /= m;
1586
+ denom1 /= m;
1587
+ denom2 /= m;
1588
+ result -= avgrank * avgrank;
1589
+ denom1 -= avgrank * avgrank;
1590
+ denom2 -= avgrank * avgrank;
1591
+ if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
1592
+ if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
1593
+ result = result / sqrt(denom1*denom2);
1594
+ result = 1. - result;
1595
+ return result;
1596
+ }
1597
+
1598
+ /* ********************************************************************* */
1599
+
1600
+ static
1601
+ double kendall (int n, double** data1, double** data2, int** mask1, int** mask2,
1602
+ const double weight[], int index1, int index2, int transpose)
1603
+ /*
1604
+ Purpose
1605
+ =======
1606
+
1607
+ The kendall routine calculates the Kendall distance between two
1608
+ rows or columns. The Kendall distance is defined as one minus Kendall's tau.
1609
+
1610
+ Arguments
1611
+ =========
1612
+
1613
+ n (input) int
1614
+ The number of elements in a row or column. If transpose==0, then n is the number
1615
+ of columns; otherwise, n is the number of rows.
1616
+
1617
+ data1 (input) double array
1618
+ The data array containing the first vector.
1619
+
1620
+ data2 (input) double array
1621
+ The data array containing the second vector.
1622
+
1623
+ mask1 (input) int array
1624
+ This array which elements in data1 are missing. If mask1[i][j]==0, then
1625
+ data1[i][j] is missing.
1626
+
1627
+ mask2 (input) int array
1628
+ This array which elements in data2 are missing. If mask2[i][j]==0, then
1629
+ data2[i][j] is missing.
1630
+
1631
+ weight (input) double[n]
1632
+ These weights are ignored, but included for consistency with other distance
1633
+ measures.
1634
+
1635
+ index1 (input) int
1636
+ Index of the first row or column.
1637
+
1638
+ index2 (input) int
1639
+ Index of the second row or column.
1640
+
1641
+ transpose (input) int
1642
+ If transpose==0, the distance between two rows in the matrix is calculated.
1643
+ Otherwise, the distance between two columns in the matrix is calculated.
1644
+ ============================================================================
1645
+ */
1646
+ { int con = 0;
1647
+ int dis = 0;
1648
+ int exx = 0;
1649
+ int exy = 0;
1650
+ int flag = 0;
1651
+ /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
1652
+ * found.
1653
+ */
1654
+ double denomx;
1655
+ double denomy;
1656
+ double tau;
1657
+ int i, j;
1658
+ if (transpose==0)
1659
+ { for (i = 0; i < n; i++)
1660
+ { if (mask1[index1][i] && mask2[index2][i])
1661
+ { for (j = 0; j < i; j++)
1662
+ { if (mask1[index1][j] && mask2[index2][j])
1663
+ { double x1 = data1[index1][i];
1664
+ double x2 = data1[index1][j];
1665
+ double y1 = data2[index2][i];
1666
+ double y2 = data2[index2][j];
1667
+ if (x1 < x2 && y1 < y2) con++;
1668
+ if (x1 > x2 && y1 > y2) con++;
1669
+ if (x1 < x2 && y1 > y2) dis++;
1670
+ if (x1 > x2 && y1 < y2) dis++;
1671
+ if (x1 == x2 && y1 != y2) exx++;
1672
+ if (x1 != x2 && y1 == y2) exy++;
1673
+ flag = 1;
1674
+ }
1675
+ }
1676
+ }
1677
+ }
1678
+ }
1679
+ else
1680
+ { for (i = 0; i < n; i++)
1681
+ { if (mask1[i][index1] && mask2[i][index2])
1682
+ { for (j = 0; j < i; j++)
1683
+ { if (mask1[j][index1] && mask2[j][index2])
1684
+ { double x1 = data1[i][index1];
1685
+ double x2 = data1[j][index1];
1686
+ double y1 = data2[i][index2];
1687
+ double y2 = data2[j][index2];
1688
+ if (x1 < x2 && y1 < y2) con++;
1689
+ if (x1 > x2 && y1 > y2) con++;
1690
+ if (x1 < x2 && y1 > y2) dis++;
1691
+ if (x1 > x2 && y1 < y2) dis++;
1692
+ if (x1 == x2 && y1 != y2) exx++;
1693
+ if (x1 != x2 && y1 == y2) exy++;
1694
+ flag = 1;
1695
+ }
1696
+ }
1697
+ }
1698
+ }
1699
+ }
1700
+ if (!flag) return 0.;
1701
+ denomx = con + dis + exx;
1702
+ denomy = con + dis + exy;
1703
+ if (denomx==0) return 1;
1704
+ if (denomy==0) return 1;
1705
+ tau = (con-dis)/sqrt(denomx*denomy);
1706
+ return 1.-tau;
1707
+ }
1708
+
1709
+ /* ********************************************************************* */
1710
+
1711
+ static double(*setmetric(char dist))
1712
+ (int, double**, double**, int**, int**, const double[], int, int, int)
1713
+ { switch(dist)
1714
+ { case 'e': return &euclid;
1715
+ case 'b': return &cityblock;
1716
+ case 'c': return &correlation;
1717
+ case 'a': return &acorrelation;
1718
+ case 'u': return &ucorrelation;
1719
+ case 'x': return &uacorrelation;
1720
+ case 's': return &spearman;
1721
+ case 'k': return &kendall;
1722
+ default: return &euclid;
1723
+ }
1724
+ return NULL; /* Never get here */
1725
+ }
1726
+
1727
+ /* ********************************************************************* */
1728
+
1729
+ static double uniform(void)
1730
+ /*
1731
+ Purpose
1732
+ =======
1733
+
1734
+ This routine returns a uniform random number between 0.0 and 1.0. Both 0.0
1735
+ and 1.0 are excluded. This random number generator is described in:
1736
+
1737
+ Pierre l'Ecuyer
1738
+ Efficient and Portable Combined Random Number Generators
1739
+ Communications of the ACM, Volume 31, Number 6, June 1988, pages 742-749,774.
1740
+
1741
+ The first time this routine is called, it initializes the random number
1742
+ generator using the current time. First, the current epoch time in seconds is
1743
+ used as a seed for the random number generator in the C library. The first two
1744
+ random numbers generated by this generator are used to initialize the random
1745
+ number generator implemented in this routine.
1746
+
1747
+
1748
+ Arguments
1749
+ =========
1750
+
1751
+ None.
1752
+
1753
+
1754
+ Return value
1755
+ ============
1756
+
1757
+ A double-precison number between 0.0 and 1.0.
1758
+ ============================================================================
1759
+ */
1760
+ { int z;
1761
+ static const int m1 = 2147483563;
1762
+ static const int m2 = 2147483399;
1763
+ const double scale = 1.0/m1;
1764
+
1765
+ static int s1 = 0;
1766
+ static int s2 = 0;
1767
+
1768
+ if (s1==0 || s2==0) /* initialize */
1769
+ { unsigned int initseed = (unsigned int) time(0);
1770
+ srand(initseed);
1771
+ s1 = rand();
1772
+ s2 = rand();
1773
+ }
1774
+
1775
+ do
1776
+ { int k;
1777
+ k = s1/53668;
1778
+ s1 = 40014*(s1-k*53668)-k*12211;
1779
+ if (s1 < 0) s1+=m1;
1780
+ k = s2/52774;
1781
+ s2 = 40692*(s2-k*52774)-k*3791;
1782
+ if(s2 < 0) s2+=m2;
1783
+ z = s1-s2;
1784
+ if(z < 1) z+=(m1-1);
1785
+ } while (z==m1); /* To avoid returning 1.0 */
1786
+
1787
+ return z*scale;
1788
+ }
1789
+
1790
+ /* ************************************************************************ */
1791
+
1792
+ static int binomial(int n, double p)
1793
+ /*
1794
+ Purpose
1795
+ =======
1796
+
1797
+ This routine generates a random number between 0 and n inclusive, following
1798
+ the binomial distribution with probability p and n trials. The routine is
1799
+ based on the BTPE algorithm, described in:
1800
+
1801
+ Voratas Kachitvichyanukul and Bruce W. Schmeiser:
1802
+ Binomial Random Variate Generation
1803
+ Communications of the ACM, Volume 31, Number 2, February 1988, pages 216-222.
1804
+
1805
+
1806
+ Arguments
1807
+ =========
1808
+
1809
+ p (input) double
1810
+ The probability of a single event. This probability should be less than or
1811
+ equal to 0.5.
1812
+
1813
+ n (input) int
1814
+ The number of trials.
1815
+
1816
+
1817
+ Return value
1818
+ ============
1819
+
1820
+ An integer drawn from a binomial distribution with parameters (p, n).
1821
+
1822
+ ============================================================================
1823
+ */
1824
+ { const double q = 1 - p;
1825
+ if (n*p < 30.0) /* Algorithm BINV */
1826
+ { const double s = p/q;
1827
+ const double a = (n+1)*s;
1828
+ double r = exp(n*log(q)); /* pow() causes a crash on AIX */
1829
+ int x = 0;
1830
+ double u = uniform();
1831
+ while(1)
1832
+ { if (u < r) return x;
1833
+ u-=r;
1834
+ x++;
1835
+ r *= (a/x)-s;
1836
+ }
1837
+ }
1838
+ else /* Algorithm BTPE */
1839
+ { /* Step 0 */
1840
+ const double fm = n*p + p;
1841
+ const int m = (int) fm;
1842
+ const double p1 = floor(2.195*sqrt(n*p*q) -4.6*q) + 0.5;
1843
+ const double xm = m + 0.5;
1844
+ const double xl = xm - p1;
1845
+ const double xr = xm + p1;
1846
+ const double c = 0.134 + 20.5/(15.3+m);
1847
+ const double a = (fm-xl)/(fm-xl*p);
1848
+ const double b = (xr-fm)/(xr*q);
1849
+ const double lambdal = a*(1.0+0.5*a);
1850
+ const double lambdar = b*(1.0+0.5*b);
1851
+ const double p2 = p1*(1+2*c);
1852
+ const double p3 = p2 + c/lambdal;
1853
+ const double p4 = p3 + c/lambdar;
1854
+ while (1)
1855
+ { /* Step 1 */
1856
+ int y;
1857
+ int k;
1858
+ double u = uniform();
1859
+ double v = uniform();
1860
+ u *= p4;
1861
+ if (u <= p1) return (int)(xm-p1*v+u);
1862
+ /* Step 2 */
1863
+ if (u > p2)
1864
+ { /* Step 3 */
1865
+ if (u > p3)
1866
+ { /* Step 4 */
1867
+ y = (int)(xr-log(v)/lambdar);
1868
+ if (y > n) continue;
1869
+ /* Go to step 5 */
1870
+ v = v*(u-p3)*lambdar;
1871
+ }
1872
+ else
1873
+ { y = (int)(xl+log(v)/lambdal);
1874
+ if (y < 0) continue;
1875
+ /* Go to step 5 */
1876
+ v = v*(u-p2)*lambdal;
1877
+ }
1878
+ }
1879
+ else
1880
+ { const double x = xl + (u-p1)/c;
1881
+ v = v*c + 1.0 - fabs(m-x+0.5)/p1;
1882
+ if (v > 1) continue;
1883
+ /* Go to step 5 */
1884
+ y = (int)x;
1885
+ }
1886
+ /* Step 5 */
1887
+ /* Step 5.0 */
1888
+ k = abs(y-m);
1889
+ if (k > 20 && k < 0.5*n*p*q-1.0)
1890
+ { /* Step 5.2 */
1891
+ double rho = (k/(n*p*q))*((k*(k/3.0 + 0.625) + 0.1666666666666)/(n*p*q)+0.5);
1892
+ double t = -k*k/(2*n*p*q);
1893
+ double A = log(v);
1894
+ if (A < t-rho) return y;
1895
+ else if (A > t+rho) continue;
1896
+ else
1897
+ { /* Step 5.3 */
1898
+ double x1 = y+1;
1899
+ double f1 = m+1;
1900
+ double z = n+1-m;
1901
+ double w = n-y+1;
1902
+ double x2 = x1*x1;
1903
+ double f2 = f1*f1;
1904
+ double z2 = z*z;
1905
+ double w2 = w*w;
1906
+ if (A > xm * log(f1/x1) + (n-m+0.5)*log(z/w)
1907
+ + (y-m)*log(w*p/(x1*q))
1908
+ + (13860.-(462.-(132.-(99.-140./f2)/f2)/f2)/f2)/f1/166320.
1909
+ + (13860.-(462.-(132.-(99.-140./z2)/z2)/z2)/z2)/z/166320.
1910
+ + (13860.-(462.-(132.-(99.-140./x2)/x2)/x2)/x2)/x1/166320.
1911
+ + (13860.-(462.-(132.-(99.-140./w2)/w2)/w2)/w2)/w/166320.)
1912
+ continue;
1913
+ return y;
1914
+ }
1915
+ }
1916
+ else
1917
+ { /* Step 5.1 */
1918
+ int i;
1919
+ const double s = p/q;
1920
+ const double aa = s*(n+1);
1921
+ double f = 1.0;
1922
+ for (i = m; i < y; f *= (aa/(++i)-s));
1923
+ for (i = y; i < m; f /= (aa/(++i)-s));
1924
+ if (v > f) continue;
1925
+ return y;
1926
+ }
1927
+ }
1928
+ }
1929
+ /* Never get here */
1930
+ return -1;
1931
+ }
1932
+
1933
+ /* ************************************************************************ */
1934
+
1935
+ static void randomassign (int nclusters, int nelements, int clusterid[])
1936
+ /*
1937
+ Purpose
1938
+ =======
1939
+
1940
+ The randomassign routine performs an initial random clustering, needed for
1941
+ k-means or k-median clustering. Elements (genes or microarrays) are randomly
1942
+ assigned to clusters. The number of elements in each cluster is chosen
1943
+ randomly, making sure that each cluster will receive at least one element.
1944
+
1945
+
1946
+ Arguments
1947
+ =========
1948
+
1949
+ nclusters (input) int
1950
+ The number of clusters.
1951
+
1952
+ nelements (input) int
1953
+ The number of elements to be clustered (i.e., the number of genes or microarrays
1954
+ to be clustered).
1955
+
1956
+ clusterid (output) int[nelements]
1957
+ The cluster number to which an element was assigned.
1958
+
1959
+ ============================================================================
1960
+ */
1961
+ { int i, j;
1962
+ int k = 0;
1963
+ double p;
1964
+ int n = nelements-nclusters;
1965
+ /* Draw the number of elements in each cluster from a multinomial
1966
+ * distribution, reserving ncluster elements to set independently
1967
+ * in order to guarantee that none of the clusters are empty.
1968
+ */
1969
+ for (i = 0; i < nclusters-1; i++)
1970
+ { p = 1.0/(nclusters-i);
1971
+ j = binomial(n, p);
1972
+ n -= j;
1973
+ j += k+1; /* Assign at least one element to cluster i */
1974
+ for ( ; k < j; k++) clusterid[k] = i;
1975
+ }
1976
+ /* Assign the remaining elements to the last cluster */
1977
+ for ( ; k < nelements; k++) clusterid[k] = i;
1978
+
1979
+ /* Create a random permutation of the cluster assignments */
1980
+ for (i = 0; i < nelements; i++)
1981
+ { j = (int) (i + (nelements-i)*uniform());
1982
+ k = clusterid[j];
1983
+ clusterid[j] = clusterid[i];
1984
+ clusterid[i] = k;
1985
+ }
1986
+
1987
+ return;
1988
+ }
1989
+
1990
+ /* ********************************************************************* */
1991
+
1992
+ static void getclustermeans(int nclusters, int nrows, int ncolumns,
1993
+ double** data, int** mask, int clusterid[], double** cdata, int** cmask,
1994
+ int transpose)
1995
+ /*
1996
+ Purpose
1997
+ =======
1998
+
1999
+ The getclustermeans routine calculates the cluster centroids, given to which
2000
+ cluster each element belongs. The centroid is defined as the mean over all
2001
+ elements for each dimension.
2002
+
2003
+ Arguments
2004
+ =========
2005
+
2006
+ nclusters (input) int
2007
+ The number of clusters.
2008
+
2009
+ nrows (input) int
2010
+ The number of rows in the gene expression data matrix, equal to the number of
2011
+ genes.
2012
+
2013
+ ncolumns (input) int
2014
+ The number of columns in the gene expression data matrix, equal to the number of
2015
+ microarrays.
2016
+
2017
+ data (input) double[nrows][ncolumns]
2018
+ The array containing the gene expression data.
2019
+
2020
+ mask (input) int[nrows][ncolumns]
2021
+ This array shows which data values are missing. If mask[i][j]==0, then
2022
+ data[i][j] is missing.
2023
+
2024
+ clusterid (output) int[nrows] if transpose==0
2025
+ int[ncolumns] if transpose==1
2026
+ The cluster number to which each element belongs. If transpose==0, then the
2027
+ dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
2028
+ is equal to ncolumns (the number of microarrays).
2029
+
2030
+ cdata (output) double[nclusters][ncolumns] if transpose==0
2031
+ double[nrows][nclusters] if transpose==1
2032
+ On exit of getclustermeans, this array contains the cluster centroids.
2033
+
2034
+ cmask (output) int[nclusters][ncolumns] if transpose==0
2035
+ int[nrows][nclusters] if transpose==1
2036
+ This array shows which data values of are missing for each centroid. If
2037
+ cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
2038
+ a centroid if all corresponding data values of the cluster members are missing.
2039
+
2040
+ transpose (input) int
2041
+ If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
2042
+ columns (microarrays) are specified.
2043
+
2044
+ ========================================================================
2045
+ */
2046
+ { int i, j, k;
2047
+ if (transpose==0)
2048
+ { for (i = 0; i < nclusters; i++)
2049
+ { for (j = 0; j < ncolumns; j++)
2050
+ { cmask[i][j] = 0;
2051
+ cdata[i][j] = 0.;
2052
+ }
2053
+ }
2054
+ for (k = 0; k < nrows; k++)
2055
+ { i = clusterid[k];
2056
+ for (j = 0; j < ncolumns; j++)
2057
+ { if (mask[k][j] != 0)
2058
+ { cdata[i][j]+=data[k][j];
2059
+ cmask[i][j]++;
2060
+ }
2061
+ }
2062
+ }
2063
+ for (i = 0; i < nclusters; i++)
2064
+ { for (j = 0; j < ncolumns; j++)
2065
+ { if (cmask[i][j]>0)
2066
+ { cdata[i][j] /= cmask[i][j];
2067
+ cmask[i][j] = 1;
2068
+ }
2069
+ }
2070
+ }
2071
+ }
2072
+ else
2073
+ { for (i = 0; i < nrows; i++)
2074
+ { for (j = 0; j < nclusters; j++)
2075
+ { cdata[i][j] = 0.;
2076
+ cmask[i][j] = 0;
2077
+ }
2078
+ }
2079
+ for (k = 0; k < ncolumns; k++)
2080
+ { i = clusterid[k];
2081
+ for (j = 0; j < nrows; j++)
2082
+ { if (mask[j][k] != 0)
2083
+ { cdata[j][i]+=data[j][k];
2084
+ cmask[j][i]++;
2085
+ }
2086
+ }
2087
+ }
2088
+ for (i = 0; i < nrows; i++)
2089
+ { for (j = 0; j < nclusters; j++)
2090
+ { if (cmask[i][j]>0)
2091
+ { cdata[i][j] /= cmask[i][j];
2092
+ cmask[i][j] = 1;
2093
+ }
2094
+ }
2095
+ }
2096
+ }
2097
+ }
2098
+
2099
+ /* ********************************************************************* */
2100
+
2101
+ static void
2102
+ getclustermedians(int nclusters, int nrows, int ncolumns,
2103
+ double** data, int** mask, int clusterid[], double** cdata, int** cmask,
2104
+ int transpose, double cache[])
2105
+ /*
2106
+ Purpose
2107
+ =======
2108
+
2109
+ The getclustermedians routine calculates the cluster centroids, given to which
2110
+ cluster each element belongs. The centroid is defined as the median over all
2111
+ elements for each dimension.
2112
+
2113
+ Arguments
2114
+ =========
2115
+
2116
+ nclusters (input) int
2117
+ The number of clusters.
2118
+
2119
+ nrows (input) int
2120
+ The number of rows in the gene expression data matrix, equal to the number of
2121
+ genes.
2122
+
2123
+ ncolumns (input) int
2124
+ The number of columns in the gene expression data matrix, equal to the number of
2125
+ microarrays.
2126
+
2127
+ data (input) double[nrows][ncolumns]
2128
+ The array containing the gene expression data.
2129
+
2130
+ mask (input) int[nrows][ncolumns]
2131
+ This array shows which data values are missing. If mask[i][j]==0, then
2132
+ data[i][j] is missing.
2133
+
2134
+ clusterid (output) int[nrows] if transpose==0
2135
+ int[ncolumns] if transpose==1
2136
+ The cluster number to which each element belongs. If transpose==0, then the
2137
+ dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
2138
+ is equal to ncolumns (the number of microarrays).
2139
+
2140
+ cdata (output) double[nclusters][ncolumns] if transpose==0
2141
+ double[nrows][nclusters] if transpose==1
2142
+ On exit of getclustermedians, this array contains the cluster centroids.
2143
+
2144
+ cmask (output) int[nclusters][ncolumns] if transpose==0
2145
+ int[nrows][nclusters] if transpose==1
2146
+ This array shows which data values of are missing for each centroid. If
2147
+ cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
2148
+ a centroid if all corresponding data values of the cluster members are missing.
2149
+
2150
+ transpose (input) int
2151
+ If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
2152
+ columns (microarrays) are specified.
2153
+
2154
+ cache (input) double[nrows] if transpose==0
2155
+ double[ncolumns] if transpose==1
2156
+ This array should be allocated before calling getclustermedians; its contents
2157
+ on input is not relevant. This array is used as a temporary storage space when
2158
+ calculating the medians.
2159
+
2160
+ ========================================================================
2161
+ */
2162
+ { int i, j, k;
2163
+ if (transpose==0)
2164
+ { for (i = 0; i < nclusters; i++)
2165
+ { for (j = 0; j < ncolumns; j++)
2166
+ { int count = 0;
2167
+ for (k = 0; k < nrows; k++)
2168
+ { if (i==clusterid[k] && mask[k][j])
2169
+ { cache[count] = data[k][j];
2170
+ count++;
2171
+ }
2172
+ }
2173
+ if (count>0)
2174
+ { cdata[i][j] = median(count,cache);
2175
+ cmask[i][j] = 1;
2176
+ }
2177
+ else
2178
+ { cdata[i][j] = 0.;
2179
+ cmask[i][j] = 0;
2180
+ }
2181
+ }
2182
+ }
2183
+ }
2184
+ else
2185
+ { for (i = 0; i < nclusters; i++)
2186
+ { for (j = 0; j < nrows; j++)
2187
+ { int count = 0;
2188
+ for (k = 0; k < ncolumns; k++)
2189
+ { if (i==clusterid[k] && mask[j][k])
2190
+ { cache[count] = data[j][k];
2191
+ count++;
2192
+ }
2193
+ }
2194
+ if (count>0)
2195
+ { cdata[j][i] = median(count,cache);
2196
+ cmask[j][i] = 1;
2197
+ }
2198
+ else
2199
+ { cdata[j][i] = 0.;
2200
+ cmask[j][i] = 0;
2201
+ }
2202
+ }
2203
+ }
2204
+ }
2205
+ }
2206
+
2207
+ /* ********************************************************************* */
2208
+
2209
+ int getclustercentroids(int nclusters, int nrows, int ncolumns,
2210
+ double** data, int** mask, int clusterid[], double** cdata, int** cmask,
2211
+ int transpose, char method)
2212
+ /*
2213
+ Purpose
2214
+ =======
2215
+
2216
+ The getclustercentroids routine calculates the cluster centroids, given to
2217
+ which cluster each element belongs. Depending on the argument method, the
2218
+ centroid is defined as either the mean or the median for each dimension over
2219
+ all elements belonging to a cluster.
2220
+
2221
+ Arguments
2222
+ =========
2223
+
2224
+ nclusters (input) int
2225
+ The number of clusters.
2226
+
2227
+ nrows (input) int
2228
+ The number of rows in the gene expression data matrix, equal to the number of
2229
+ genes.
2230
+
2231
+ ncolumns (input) int
2232
+ The number of columns in the gene expression data matrix, equal to the number of
2233
+ microarrays.
2234
+
2235
+ data (input) double[nrows][ncolumns]
2236
+ The array containing the gene expression data.
2237
+
2238
+ mask (input) int[nrows][ncolumns]
2239
+ This array shows which data values are missing. If mask[i][j]==0, then
2240
+ data[i][j] is missing.
2241
+
2242
+ clusterid (output) int[nrows] if transpose==0
2243
+ int[ncolumns] if transpose==1
2244
+ The cluster number to which each element belongs. If transpose==0, then the
2245
+ dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
2246
+ is equal to ncolumns (the number of microarrays).
2247
+
2248
+ cdata (output) double[nclusters][ncolumns] if transpose==0
2249
+ double[nrows][nclusters] if transpose==1
2250
+ On exit of getclustercentroids, this array contains the cluster centroids.
2251
+
2252
+ cmask (output) int[nclusters][ncolumns] if transpose==0
2253
+ int[nrows][nclusters] if transpose==1
2254
+ This array shows which data values of are missing for each centroid. If
2255
+ cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
2256
+ a centroid if all corresponding data values of the cluster members are missing.
2257
+
2258
+ transpose (input) int
2259
+ If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
2260
+ columns (microarrays) are specified.
2261
+
2262
+ method (input) char
2263
+ For method=='a', the centroid is defined as the mean over all elements
2264
+ belonging to a cluster for each dimension.
2265
+ For method=='m', the centroid is defined as the median over all elements
2266
+ belonging to a cluster for each dimension.
2267
+
2268
+ Return value
2269
+ ============
2270
+
2271
+ The function returns an integer to indicate success or failure. If a
2272
+ memory error occurs, or if method is not 'm' or 'a', getclustercentroids
2273
+ returns 0. If successful, getclustercentroids returns 1.
2274
+ ========================================================================
2275
+ */
2276
+ { switch(method)
2277
+ { case 'm':
2278
+ { const int nelements = (transpose==0) ? nrows : ncolumns;
2279
+ double* cache = malloc(nelements*sizeof(double));
2280
+ if (!cache) return 0;
2281
+ getclustermedians(nclusters, nrows, ncolumns, data, mask, clusterid,
2282
+ cdata, cmask, transpose, cache);
2283
+ free(cache);
2284
+ return 1;
2285
+ }
2286
+ case 'a':
2287
+ { getclustermeans(nclusters, nrows, ncolumns, data, mask, clusterid,
2288
+ cdata, cmask, transpose);
2289
+ return 1;
2290
+ }
2291
+ }
2292
+ return 0;
2293
+ }
2294
+
2295
+ /* ********************************************************************* */
2296
+
2297
+ void getclustermedoids(int nclusters, int nelements, double** distance,
2298
+ int clusterid[], int centroids[], double errors[])
2299
+ /*
2300
+ Purpose
2301
+ =======
2302
+
2303
+ The getclustermedoids routine calculates the cluster centroids, given to which
2304
+ cluster each element belongs. The centroid is defined as the element with the
2305
+ smallest sum of distances to the other elements.
2306
+
2307
+ Arguments
2308
+ =========
2309
+
2310
+ nclusters (input) int
2311
+ The number of clusters.
2312
+
2313
+ nelements (input) int
2314
+ The total number of elements.
2315
+
2316
+ distmatrix (input) double array, ragged
2317
+ (number of rows is nelements, number of columns is equal to the row number)
2318
+ The distance matrix. To save space, the distance matrix is given in the
2319
+ form of a ragged array. The distance matrix is symmetric and has zeros
2320
+ on the diagonal. See distancematrix for a description of the content.
2321
+
2322
+ clusterid (output) int[nelements]
2323
+ The cluster number to which each element belongs.
2324
+
2325
+ centroid (output) int[nclusters]
2326
+ The index of the element that functions as the centroid for each cluster.
2327
+
2328
+ errors (output) double[nclusters]
2329
+ The within-cluster sum of distances between the items and the cluster
2330
+ centroid.
2331
+
2332
+ ========================================================================
2333
+ */
2334
+ { int i, j, k;
2335
+ for (j = 0; j < nclusters; j++) errors[j] = DBL_MAX;
2336
+ for (i = 0; i < nelements; i++)
2337
+ { double d = 0.0;
2338
+ j = clusterid[i];
2339
+ for (k = 0; k < nelements; k++)
2340
+ { if (i==k || clusterid[k]!=j) continue;
2341
+ d += (i < k ? distance[k][i] : distance[i][k]);
2342
+ if (d > errors[j]) break;
2343
+ }
2344
+ if (d < errors[j])
2345
+ { errors[j] = d;
2346
+ centroids[j] = i;
2347
+ }
2348
+ }
2349
+ }
2350
+
2351
+ /* ********************************************************************* */
2352
+
2353
+ static int
2354
+ kmeans(int nclusters, int nrows, int ncolumns, double** data, int** mask,
2355
+ double weight[], int transpose, int npass, char dist,
2356
+ double** cdata, int** cmask, int clusterid[], double* error,
2357
+ int tclusterid[], int counts[], int mapping[])
2358
+ { int i, j, k;
2359
+ const int nelements = (transpose==0) ? nrows : ncolumns;
2360
+ const int ndata = (transpose==0) ? ncolumns : nrows;
2361
+ int ifound = 1;
2362
+ int ipass = 0;
2363
+ /* Set the metric function as indicated by dist */
2364
+ double (*metric)
2365
+ (int, double**, double**, int**, int**, const double[], int, int, int) =
2366
+ setmetric(dist);
2367
+
2368
+ /* We save the clustering solution periodically and check if it reappears */
2369
+ int* saved = malloc(nelements*sizeof(int));
2370
+ if (saved==NULL) return -1;
2371
+
2372
+ *error = DBL_MAX;
2373
+
2374
+ do
2375
+ { double total = DBL_MAX;
2376
+ int counter = 0;
2377
+ int period = 10;
2378
+
2379
+ /* Perform the EM algorithm. First, randomly assign elements to clusters. */
2380
+ if (npass!=0) randomassign (nclusters, nelements, tclusterid);
2381
+
2382
+ for (i = 0; i < nclusters; i++) counts[i] = 0;
2383
+ for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
2384
+
2385
+ /* Start the loop */
2386
+ while(1)
2387
+ { double previous = total;
2388
+ total = 0.0;
2389
+
2390
+ if (counter % period == 0) /* Save the current cluster assignments */
2391
+ { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
2392
+ if (period < INT_MAX / 2) period *= 2;
2393
+ }
2394
+ counter++;
2395
+
2396
+ /* Find the center */
2397
+ getclustermeans(nclusters, nrows, ncolumns, data, mask, tclusterid,
2398
+ cdata, cmask, transpose);
2399
+
2400
+ for (i = 0; i < nelements; i++)
2401
+ /* Calculate the distances */
2402
+ { double distance;
2403
+ k = tclusterid[i];
2404
+ if (counts[k]==1) continue;
2405
+ /* No reassignment if that would lead to an empty cluster */
2406
+ /* Treat the present cluster as a special case */
2407
+ distance = metric(ndata,data,cdata,mask,cmask,weight,i,k,transpose);
2408
+ for (j = 0; j < nclusters; j++)
2409
+ { double tdistance;
2410
+ if (j==k) continue;
2411
+ tdistance = metric(ndata,data,cdata,mask,cmask,weight,i,j,transpose);
2412
+ if (tdistance < distance)
2413
+ { distance = tdistance;
2414
+ counts[tclusterid[i]]--;
2415
+ tclusterid[i] = j;
2416
+ counts[j]++;
2417
+ }
2418
+ }
2419
+ total += distance;
2420
+ }
2421
+ if (total>=previous) break;
2422
+ /* total>=previous is FALSE on some machines even if total and previous
2423
+ * are bitwise identical. */
2424
+ for (i = 0; i < nelements; i++)
2425
+ if (saved[i]!=tclusterid[i]) break;
2426
+ if (i==nelements)
2427
+ break; /* Identical solution found; break out of this loop */
2428
+ }
2429
+
2430
+ if (npass<=1)
2431
+ { *error = total;
2432
+ break;
2433
+ }
2434
+
2435
+ for (i = 0; i < nclusters; i++) mapping[i] = -1;
2436
+ for (i = 0; i < nelements; i++)
2437
+ { j = tclusterid[i];
2438
+ k = clusterid[i];
2439
+ if (mapping[k] == -1) mapping[k] = j;
2440
+ else if (mapping[k] != j)
2441
+ { if (total < *error)
2442
+ { ifound = 1;
2443
+ *error = total;
2444
+ for (j = 0; j < nelements; j++) clusterid[j] = tclusterid[j];
2445
+ }
2446
+ break;
2447
+ }
2448
+ }
2449
+ if (i==nelements) ifound++; /* break statement not encountered */
2450
+ } while (++ipass < npass);
2451
+
2452
+ free(saved);
2453
+ return ifound;
2454
+ }
2455
+
2456
+ /* ---------------------------------------------------------------------- */
2457
+
2458
+ static int
2459
+ kmedians(int nclusters, int nrows, int ncolumns, double** data, int** mask,
2460
+ double weight[], int transpose, int npass, char dist,
2461
+ double** cdata, int** cmask, int clusterid[], double* error,
2462
+ int tclusterid[], int counts[], int mapping[], double cache[])
2463
+ { int i, j, k;
2464
+ const int nelements = (transpose==0) ? nrows : ncolumns;
2465
+ const int ndata = (transpose==0) ? ncolumns : nrows;
2466
+ int ifound = 1;
2467
+ int ipass = 0;
2468
+ /* Set the metric function as indicated by dist */
2469
+ double (*metric)
2470
+ (int, double**, double**, int**, int**, const double[], int, int, int) =
2471
+ setmetric(dist);
2472
+
2473
+ /* We save the clustering solution periodically and check if it reappears */
2474
+ int* saved = malloc(nelements*sizeof(int));
2475
+ if (saved==NULL) return -1;
2476
+
2477
+ *error = DBL_MAX;
2478
+
2479
+ do
2480
+ { double total = DBL_MAX;
2481
+ int counter = 0;
2482
+ int period = 10;
2483
+
2484
+ /* Perform the EM algorithm. First, randomly assign elements to clusters. */
2485
+ if (npass!=0) randomassign (nclusters, nelements, tclusterid);
2486
+
2487
+ for (i = 0; i < nclusters; i++) counts[i]=0;
2488
+ for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
2489
+
2490
+ /* Start the loop */
2491
+ while(1)
2492
+ { double previous = total;
2493
+ total = 0.0;
2494
+
2495
+ if (counter % period == 0) /* Save the current cluster assignments */
2496
+ { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
2497
+ if (period < INT_MAX / 2) period *= 2;
2498
+ }
2499
+ counter++;
2500
+
2501
+ /* Find the center */
2502
+ getclustermedians(nclusters, nrows, ncolumns, data, mask, tclusterid,
2503
+ cdata, cmask, transpose, cache);
2504
+
2505
+ for (i = 0; i < nelements; i++)
2506
+ /* Calculate the distances */
2507
+ { double distance;
2508
+ k = tclusterid[i];
2509
+ if (counts[k]==1) continue;
2510
+ /* No reassignment if that would lead to an empty cluster */
2511
+ /* Treat the present cluster as a special case */
2512
+ distance = metric(ndata,data,cdata,mask,cmask,weight,i,k,transpose);
2513
+ for (j = 0; j < nclusters; j++)
2514
+ { double tdistance;
2515
+ if (j==k) continue;
2516
+ tdistance = metric(ndata,data,cdata,mask,cmask,weight,i,j,transpose);
2517
+ if (tdistance < distance)
2518
+ { distance = tdistance;
2519
+ counts[tclusterid[i]]--;
2520
+ tclusterid[i] = j;
2521
+ counts[j]++;
2522
+ }
2523
+ }
2524
+ total += distance;
2525
+ }
2526
+ if (total>=previous) break;
2527
+ /* total>=previous is FALSE on some machines even if total and previous
2528
+ * are bitwise identical. */
2529
+ for (i = 0; i < nelements; i++)
2530
+ if (saved[i]!=tclusterid[i]) break;
2531
+ if (i==nelements)
2532
+ break; /* Identical solution found; break out of this loop */
2533
+ }
2534
+
2535
+ if (npass<=1)
2536
+ { *error = total;
2537
+ break;
2538
+ }
2539
+
2540
+ for (i = 0; i < nclusters; i++) mapping[i] = -1;
2541
+ for (i = 0; i < nelements; i++)
2542
+ { j = tclusterid[i];
2543
+ k = clusterid[i];
2544
+ if (mapping[k] == -1) mapping[k] = j;
2545
+ else if (mapping[k] != j)
2546
+ { if (total < *error)
2547
+ { ifound = 1;
2548
+ *error = total;
2549
+ for (j = 0; j < nelements; j++) clusterid[j] = tclusterid[j];
2550
+ }
2551
+ break;
2552
+ }
2553
+ }
2554
+ if (i==nelements) ifound++; /* break statement not encountered */
2555
+ } while (++ipass < npass);
2556
+
2557
+ free(saved);
2558
+ return ifound;
2559
+ }
2560
+
2561
+ /* ********************************************************************* */
2562
+
2563
+ void kcluster (int nclusters, int nrows, int ncolumns,
2564
+ double** data, int** mask, double weight[], int transpose,
2565
+ int npass, char method, char dist,
2566
+ int clusterid[], double* error, int* ifound)
2567
+ /*
2568
+ Purpose
2569
+ =======
2570
+
2571
+ The kcluster routine performs k-means or k-median clustering on a given set of
2572
+ elements, using the specified distance measure. The number of clusters is given
2573
+ by the user. Multiple passes are being made to find the optimal clustering
2574
+ solution, each time starting from a different initial clustering.
2575
+
2576
+
2577
+ Arguments
2578
+ =========
2579
+
2580
+ nclusters (input) int
2581
+ The number of clusters to be found.
2582
+
2583
+ data (input) double[nrows][ncolumns]
2584
+ The array containing the data of the elements to be clustered (i.e., the gene
2585
+ expression data).
2586
+
2587
+ mask (input) int[nrows][ncolumns]
2588
+ This array shows which data values are missing. If
2589
+ mask[i][j] == 0, then data[i][j] is missing.
2590
+
2591
+ nrows (input) int
2592
+ The number of rows in the data matrix, equal to the number of genes.
2593
+
2594
+ ncolumns (input) int
2595
+ The number of columns in the data matrix, equal to the number of microarrays.
2596
+
2597
+ weight (input) double[n]
2598
+ The weights that are used to calculate the distance.
2599
+
2600
+ transpose (input) int
2601
+ If transpose==0, the rows of the matrix are clustered. Otherwise, columns
2602
+ of the matrix are clustered.
2603
+
2604
+ npass (input) int
2605
+ The number of times clustering is performed. Clustering is performed npass
2606
+ times, each time starting from a different (random) initial assignment of
2607
+ genes to clusters. The clustering solution with the lowest within-cluster sum
2608
+ of distances is chosen.
2609
+ If npass==0, then the clustering algorithm will be run once, where the initial
2610
+ assignment of elements to clusters is taken from the clusterid array.
2611
+
2612
+ method (input) char
2613
+ Defines whether the arithmetic mean (method=='a') or the median
2614
+ (method=='m') is used to calculate the cluster center.
2615
+
2616
+ dist (input) char
2617
+ Defines which distance measure is used, as given by the table:
2618
+ dist=='e': Euclidean distance
2619
+ dist=='b': City-block distance
2620
+ dist=='c': correlation
2621
+ dist=='a': absolute value of the correlation
2622
+ dist=='u': uncentered correlation
2623
+ dist=='x': absolute uncentered correlation
2624
+ dist=='s': Spearman's rank correlation
2625
+ dist=='k': Kendall's tau
2626
+ For other values of dist, the default (Euclidean distance) is used.
2627
+
2628
+ clusterid (output; input) int[nrows] if transpose==0
2629
+ int[ncolumns] if transpose==1
2630
+ The cluster number to which a gene or microarray was assigned. If npass==0,
2631
+ then on input clusterid contains the initial clustering assignment from which
2632
+ the clustering algorithm starts. On output, it contains the clustering solution
2633
+ that was found.
2634
+
2635
+ error (output) double*
2636
+ The sum of distances to the cluster center of each item in the optimal k-means
2637
+ clustering solution that was found.
2638
+
2639
+ ifound (output) int*
2640
+ The number of times the optimal clustering solution was
2641
+ found. The value of ifound is at least 1; its maximum value is npass. If the
2642
+ number of clusters is larger than the number of elements being clustered,
2643
+ *ifound is set to 0 as an error code. If a memory allocation error occurs,
2644
+ *ifound is set to -1.
2645
+
2646
+ ========================================================================
2647
+ */
2648
+ { const int nelements = (transpose==0) ? nrows : ncolumns;
2649
+ const int ndata = (transpose==0) ? ncolumns : nrows;
2650
+
2651
+ int i;
2652
+ int ok;
2653
+ int* tclusterid;
2654
+ int* mapping = NULL;
2655
+ double** cdata;
2656
+ int** cmask;
2657
+ int* counts;
2658
+
2659
+ if (nelements < nclusters)
2660
+ { *ifound = 0;
2661
+ return;
2662
+ }
2663
+ /* More clusters asked for than elements available */
2664
+
2665
+ *ifound = -1;
2666
+
2667
+ /* This will contain the number of elements in each cluster, which is
2668
+ * needed to check for empty clusters. */
2669
+ counts = malloc(nclusters*sizeof(int));
2670
+ if(!counts) return;
2671
+
2672
+ /* Find out if the user specified an initial clustering */
2673
+ if (npass<=1) tclusterid = clusterid;
2674
+ else
2675
+ { tclusterid = malloc(nelements*sizeof(int));
2676
+ if (!tclusterid)
2677
+ { free(counts);
2678
+ return;
2679
+ }
2680
+ mapping = malloc(nclusters*sizeof(int));
2681
+ if (!mapping)
2682
+ { free(counts);
2683
+ free(tclusterid);
2684
+ return;
2685
+ }
2686
+ for (i = 0; i < nelements; i++) clusterid[i] = 0;
2687
+ }
2688
+
2689
+ /* Allocate space to store the centroid data */
2690
+ if (transpose==0) ok = makedatamask(nclusters, ndata, &cdata, &cmask);
2691
+ else ok = makedatamask(ndata, nclusters, &cdata, &cmask);
2692
+ if(!ok)
2693
+ { free(counts);
2694
+ if(npass>1)
2695
+ { free(tclusterid);
2696
+ free(mapping);
2697
+ return;
2698
+ }
2699
+ }
2700
+
2701
+ if (method=='m')
2702
+ { double* cache = malloc(nelements*sizeof(double));
2703
+ if(cache)
2704
+ { *ifound = kmedians(nclusters, nrows, ncolumns, data, mask, weight,
2705
+ transpose, npass, dist, cdata, cmask, clusterid, error,
2706
+ tclusterid, counts, mapping, cache);
2707
+ free(cache);
2708
+ }
2709
+ }
2710
+ else
2711
+ *ifound = kmeans(nclusters, nrows, ncolumns, data, mask, weight,
2712
+ transpose, npass, dist, cdata, cmask, clusterid, error,
2713
+ tclusterid, counts, mapping);
2714
+
2715
+ /* Deallocate temporarily used space */
2716
+ if (npass > 1)
2717
+ { free(mapping);
2718
+ free(tclusterid);
2719
+ }
2720
+
2721
+ if (transpose==0) freedatamask(nclusters, cdata, cmask);
2722
+ else freedatamask(ndata, cdata, cmask);
2723
+
2724
+ free(counts);
2725
+ }
2726
+
2727
+ /* *********************************************************************** */
2728
+
2729
+ void kmedoids (int nclusters, int nelements, double** distmatrix,
2730
+ int npass, int clusterid[], double* error, int* ifound)
2731
+ /*
2732
+ Purpose
2733
+ =======
2734
+
2735
+ The kmedoids routine performs k-medoids clustering on a given set of elements,
2736
+ using the distance matrix and the number of clusters passed by the user.
2737
+ Multiple passes are being made to find the optimal clustering solution, each
2738
+ time starting from a different initial clustering.
2739
+
2740
+
2741
+ Arguments
2742
+ =========
2743
+
2744
+ nclusters (input) int
2745
+ The number of clusters to be found.
2746
+
2747
+ nelements (input) int
2748
+ The number of elements to be clustered.
2749
+
2750
+ distmatrix (input) double array, ragged
2751
+ (number of rows is nelements, number of columns is equal to the row number)
2752
+ The distance matrix. To save space, the distance matrix is given in the
2753
+ form of a ragged array. The distance matrix is symmetric and has zeros
2754
+ on the diagonal. See distancematrix for a description of the content.
2755
+
2756
+ npass (input) int
2757
+ The number of times clustering is performed. Clustering is performed npass
2758
+ times, each time starting from a different (random) initial assignment of genes
2759
+ to clusters. The clustering solution with the lowest within-cluster sum of
2760
+ distances is chosen.
2761
+ If npass==0, then the clustering algorithm will be run once, where the initial
2762
+ assignment of elements to clusters is taken from the clusterid array.
2763
+
2764
+ clusterid (output; input) int[nelements]
2765
+ On input, if npass==0, then clusterid contains the initial clustering assignment
2766
+ from which the clustering algorithm starts; all numbers in clusterid should be
2767
+ between zero and nelements-1 inclusive. If npass!=0, clusterid is ignored on
2768
+ input.
2769
+ On output, clusterid contains the clustering solution that was found: clusterid
2770
+ contains the number of the cluster to which each item was assigned. On output,
2771
+ the number of a cluster is defined as the item number of the centroid of the
2772
+ cluster.
2773
+
2774
+ error (output) double
2775
+ The sum of distances to the cluster center of each item in the optimal k-medoids
2776
+ clustering solution that was found.
2777
+
2778
+ ifound (output) int
2779
+ If kmedoids is successful: the number of times the optimal clustering solution
2780
+ was found. The value of ifound is at least 1; its maximum value is npass.
2781
+ If the user requested more clusters than elements available, ifound is set
2782
+ to 0. If kmedoids fails due to a memory allocation error, ifound is set to -1.
2783
+
2784
+ ========================================================================
2785
+ */
2786
+ { int i, j, icluster;
2787
+ int* tclusterid;
2788
+ int* saved;
2789
+ int* centroids;
2790
+ double* errors;
2791
+ int ipass = 0;
2792
+
2793
+ if (nelements < nclusters)
2794
+ { *ifound = 0;
2795
+ return;
2796
+ } /* More clusters asked for than elements available */
2797
+
2798
+ *ifound = -1;
2799
+
2800
+ /* We save the clustering solution periodically and check if it reappears */
2801
+ saved = malloc(nelements*sizeof(int));
2802
+ if (saved==NULL) return;
2803
+
2804
+ centroids = malloc(nclusters*sizeof(int));
2805
+ if(!centroids)
2806
+ { free(saved);
2807
+ return;
2808
+ }
2809
+
2810
+ errors = malloc(nclusters*sizeof(double));
2811
+ if(!errors)
2812
+ { free(saved);
2813
+ free(centroids);
2814
+ return;
2815
+ }
2816
+
2817
+ /* Find out if the user specified an initial clustering */
2818
+ if (npass<=1) tclusterid = clusterid;
2819
+ else
2820
+ { tclusterid = malloc(nelements*sizeof(int));
2821
+ if(!tclusterid)
2822
+ { free(saved);
2823
+ free(centroids);
2824
+ free(errors);
2825
+ return;
2826
+ }
2827
+ }
2828
+
2829
+ *error = DBL_MAX;
2830
+ do /* Start the loop */
2831
+ { double total = DBL_MAX;
2832
+ int counter = 0;
2833
+ int period = 10;
2834
+
2835
+ if (npass!=0) randomassign (nclusters, nelements, tclusterid);
2836
+ while(1)
2837
+ { double previous = total;
2838
+ total = 0.0;
2839
+
2840
+ if (counter % period == 0) /* Save the current cluster assignments */
2841
+ { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
2842
+ if (period < INT_MAX / 2) period *= 2;
2843
+ }
2844
+ counter++;
2845
+
2846
+ /* Find the center */
2847
+ getclustermedoids(nclusters, nelements, distmatrix, tclusterid,
2848
+ centroids, errors);
2849
+
2850
+ for (i = 0; i < nelements; i++)
2851
+ /* Find the closest cluster */
2852
+ { double distance = DBL_MAX;
2853
+ for (icluster = 0; icluster < nclusters; icluster++)
2854
+ { double tdistance;
2855
+ j = centroids[icluster];
2856
+ if (i==j)
2857
+ { distance = 0.0;
2858
+ tclusterid[i] = icluster;
2859
+ break;
2860
+ }
2861
+ tdistance = (i > j) ? distmatrix[i][j] : distmatrix[j][i];
2862
+ if (tdistance < distance)
2863
+ { distance = tdistance;
2864
+ tclusterid[i] = icluster;
2865
+ }
2866
+ }
2867
+ total += distance;
2868
+ }
2869
+ if (total>=previous) break;
2870
+ /* total>=previous is FALSE on some machines even if total and previous
2871
+ * are bitwise identical. */
2872
+ for (i = 0; i < nelements; i++)
2873
+ if (saved[i]!=tclusterid[i]) break;
2874
+ if (i==nelements)
2875
+ break; /* Identical solution found; break out of this loop */
2876
+ }
2877
+
2878
+ for (i = 0; i < nelements; i++)
2879
+ { if (clusterid[i]!=centroids[tclusterid[i]])
2880
+ { if (total < *error)
2881
+ { *ifound = 1;
2882
+ *error = total;
2883
+ /* Replace by the centroid in each cluster. */
2884
+ for (j = 0; j < nelements; j++)
2885
+ clusterid[j] = centroids[tclusterid[j]];
2886
+ }
2887
+ break;
2888
+ }
2889
+ }
2890
+ if (i==nelements) (*ifound)++; /* break statement not encountered */
2891
+ } while (++ipass < npass);
2892
+
2893
+ /* Deallocate temporarily used space */
2894
+ if (npass > 1) free(tclusterid);
2895
+
2896
+ free(saved);
2897
+ free(centroids);
2898
+ free(errors);
2899
+
2900
+ return;
2901
+ }
2902
+
2903
+ /* ******************************************************************** */
2904
+
2905
+ double** distancematrix (int nrows, int ncolumns, double** data,
2906
+ int** mask, double weights[], char dist, int transpose)
2907
+ /*
2908
+ Purpose
2909
+ =======
2910
+
2911
+ The distancematrix routine calculates the distance matrix between genes or
2912
+ microarrays using their measured gene expression data. Several distance measures
2913
+ can be used. The routine returns a pointer to a ragged array containing the
2914
+ distances between the genes. As the distance matrix is symmetric, with zeros on
2915
+ the diagonal, only the lower triangular half of the distance matrix is saved.
2916
+ The distancematrix routine allocates space for the distance matrix. If the
2917
+ parameter transpose is set to a nonzero value, the distances between the columns
2918
+ (microarrays) are calculated, otherwise distances between the rows (genes) are
2919
+ calculated.
2920
+ If sufficient space in memory cannot be allocated to store the distance matrix,
2921
+ the routine returns a NULL pointer, and all memory allocated so far for the
2922
+ distance matrix is freed.
2923
+
2924
+
2925
+ Arguments
2926
+ =========
2927
+
2928
+ nrows (input) int
2929
+ The number of rows in the gene expression data matrix (i.e., the number of
2930
+ genes)
2931
+
2932
+ ncolumns (input) int
2933
+ The number of columns in the gene expression data matrix (i.e., the number of
2934
+ microarrays)
2935
+
2936
+ data (input) double[nrows][ncolumns]
2937
+ The array containing the gene expression data.
2938
+
2939
+ mask (input) int[nrows][ncolumns]
2940
+ This array shows which data values are missing. If mask[i][j]==0, then
2941
+ data[i][j] is missing.
2942
+
2943
+ weight (input) double[n]
2944
+ The weights that are used to calculate the distance. The length of this vector
2945
+ is equal to the number of columns if the distances between genes are calculated,
2946
+ or the number of rows if the distances between microarrays are calculated.
2947
+
2948
+ dist (input) char
2949
+ Defines which distance measure is used, as given by the table:
2950
+ dist=='e': Euclidean distance
2951
+ dist=='b': City-block distance
2952
+ dist=='c': correlation
2953
+ dist=='a': absolute value of the correlation
2954
+ dist=='u': uncentered correlation
2955
+ dist=='x': absolute uncentered correlation
2956
+ dist=='s': Spearman's rank correlation
2957
+ dist=='k': Kendall's tau
2958
+ For other values of dist, the default (Euclidean distance) is used.
2959
+
2960
+ transpose (input) int
2961
+ If transpose is equal to zero, the distances between the rows is
2962
+ calculated. Otherwise, the distances between the columns is calculated.
2963
+ The former is needed when genes are being clustered; the latter is used
2964
+ when microarrays are being clustered.
2965
+
2966
+ ========================================================================
2967
+ */
2968
+ { /* First determine the size of the distance matrix */
2969
+ const int n = (transpose==0) ? nrows : ncolumns;
2970
+ const int ndata = (transpose==0) ? ncolumns : nrows;
2971
+ int i,j;
2972
+ double** matrix;
2973
+
2974
+ /* Set the metric function as indicated by dist */
2975
+ double (*metric)
2976
+ (int, double**, double**, int**, int**, const double[], int, int, int) =
2977
+ setmetric(dist);
2978
+
2979
+ if (n < 2) return NULL;
2980
+
2981
+ /* Set up the ragged array */
2982
+ matrix = malloc(n*sizeof(double*));
2983
+ if(matrix==NULL) return NULL; /* Not enough memory available */
2984
+ matrix[0] = NULL;
2985
+ /* The zeroth row has zero columns. We allocate it anyway for convenience.*/
2986
+ for (i = 1; i < n; i++)
2987
+ { matrix[i] = malloc(i*sizeof(double));
2988
+ if (matrix[i]==NULL) break; /* Not enough memory available */
2989
+ }
2990
+ if (i < n) /* break condition encountered */
2991
+ { j = i;
2992
+ for (i = 1; i < j; i++) free(matrix[i]);
2993
+ return NULL;
2994
+ }
2995
+
2996
+ /* Calculate the distances and save them in the ragged array */
2997
+ for (i = 1; i < n; i++)
2998
+ for (j = 0; j < i; j++)
2999
+ matrix[i][j]=metric(ndata,data,data,mask,mask,weights,i,j,transpose);
3000
+
3001
+ return matrix;
3002
+ }
3003
+
3004
+ /* ******************************************************************** */
3005
+
3006
+ double* calculate_weights(int nrows, int ncolumns, double** data, int** mask,
3007
+ double weights[], int transpose, char dist, double cutoff, double exponent)
3008
+
3009
+ /*
3010
+ Purpose
3011
+ =======
3012
+
3013
+ This function calculates the weights using the weighting scheme proposed by
3014
+ Michael Eisen:
3015
+ w[i] = 1.0 / sum_{j where d[i][j]<cutoff} (1 - d[i][j]/cutoff)^exponent
3016
+ where the cutoff and the exponent are specified by the user.
3017
+
3018
+
3019
+ Arguments
3020
+ =========
3021
+
3022
+ nrows (input) int
3023
+ The number of rows in the gene expression data matrix, equal to the number of
3024
+ genes.
3025
+
3026
+ ncolumns (input) int
3027
+ The number of columns in the gene expression data matrix, equal to the number of
3028
+ microarrays.
3029
+
3030
+ data (input) double[nrows][ncolumns]
3031
+ The array containing the gene expression data.
3032
+
3033
+ mask (input) int[nrows][ncolumns]
3034
+ This array shows which data values are missing. If mask[i][j]==0, then
3035
+ data[i][j] is missing.
3036
+
3037
+ weight (input) int[ncolumns] if transpose==0,
3038
+ int[nrows] if transpose==1
3039
+ The weights that are used to calculate the distance. The length of this vector
3040
+ is ncolumns if gene weights are being clustered, and nrows if microarrays
3041
+ weights are being clustered.
3042
+
3043
+ transpose (input) int
3044
+ If transpose==0, the weights of the rows of the data matrix are calculated.
3045
+ Otherwise, the weights of the columns of the data matrix are calculated.
3046
+
3047
+ dist (input) char
3048
+ Defines which distance measure is used, as given by the table:
3049
+ dist=='e': Euclidean distance
3050
+ dist=='b': City-block distance
3051
+ dist=='c': correlation
3052
+ dist=='a': absolute value of the correlation
3053
+ dist=='u': uncentered correlation
3054
+ dist=='x': absolute uncentered correlation
3055
+ dist=='s': Spearman's rank correlation
3056
+ dist=='k': Kendall's tau
3057
+ For other values of dist, the default (Euclidean distance) is used.
3058
+
3059
+ cutoff (input) double
3060
+ The cutoff to be used to calculate the weights.
3061
+
3062
+ exponent (input) double
3063
+ The exponent to be used to calculate the weights.
3064
+
3065
+
3066
+ Return value
3067
+ ============
3068
+
3069
+ The function returns a pointer to a newly allocated array containing the
3070
+ calculated weights for the rows (if transpose==0) or columns (if
3071
+ transpose==1). If not enough memory could be allocated to store the
3072
+ weights array, the function returns NULL.
3073
+
3074
+ ========================================================================
3075
+ */
3076
+ { int i,j;
3077
+ const int ndata = (transpose==0) ? ncolumns : nrows;
3078
+ const int nelements = (transpose==0) ? nrows : ncolumns;
3079
+
3080
+ /* Set the metric function as indicated by dist */
3081
+ double (*metric)
3082
+ (int, double**, double**, int**, int**, const double[], int, int, int) =
3083
+ setmetric(dist);
3084
+
3085
+ double* result = malloc(nelements*sizeof(double));
3086
+ if (!result) return NULL;
3087
+ memset(result, 0, nelements*sizeof(double));
3088
+
3089
+ for (i = 0; i < nelements; i++)
3090
+ { result[i] += 1.0;
3091
+ for (j = 0; j < i; j++)
3092
+ { const double distance = metric(ndata, data, data, mask, mask, weights,
3093
+ i, j, transpose);
3094
+ if (distance < cutoff)
3095
+ { const double dweight = exp(exponent*log(1-distance/cutoff));
3096
+ /* pow() causes a crash on AIX */
3097
+ result[i] += dweight;
3098
+ result[j] += dweight;
3099
+ }
3100
+ }
3101
+ }
3102
+ for (i = 0; i < nelements; i++) result[i] = 1.0/result[i];
3103
+ return result;
3104
+ }
3105
+
3106
+ /* ******************************************************************** */
3107
+
3108
+ void cuttree (int nelements, Node* tree, int nclusters, int clusterid[])
3109
+
3110
+ /*
3111
+ Purpose
3112
+ =======
3113
+
3114
+ The cuttree routine takes the output of a hierarchical clustering routine, and
3115
+ divides the elements in the tree structure into clusters based on the
3116
+ hierarchical clustering result. The number of clusters is specified by the user.
3117
+
3118
+ Arguments
3119
+ =========
3120
+
3121
+ nelements (input) int
3122
+ The number of elements that were clustered.
3123
+
3124
+ tree (input) Node[nelements-1]
3125
+ The clustering solution. Each node in the array describes one linking event,
3126
+ with tree[i].left and tree[i].right representig the elements that were joined.
3127
+ The original elements are numbered 0..nelements-1, nodes are numbered
3128
+ -1..-(nelements-1).
3129
+
3130
+ nclusters (input) int
3131
+ The number of clusters to be formed.
3132
+
3133
+ clusterid (output) int[nelements]
3134
+ The number of the cluster to which each element was assigned. Space for this
3135
+ array should be allocated before calling the cuttree routine. If a memory
3136
+ error occured, all elements in clusterid are set to -1.
3137
+
3138
+ ========================================================================
3139
+ */
3140
+ { int i, j, k;
3141
+ int icluster = 0;
3142
+ const int n = nelements-nclusters; /* number of nodes to join */
3143
+ int* nodeid;
3144
+ for (i = nelements-2; i >= n; i--)
3145
+ { k = tree[i].left;
3146
+ if (k>=0)
3147
+ { clusterid[k] = icluster;
3148
+ icluster++;
3149
+ }
3150
+ k = tree[i].right;
3151
+ if (k>=0)
3152
+ { clusterid[k] = icluster;
3153
+ icluster++;
3154
+ }
3155
+ }
3156
+ nodeid = malloc(n*sizeof(int));
3157
+ if(!nodeid)
3158
+ { for (i = 0; i < nelements; i++) clusterid[i] = -1;
3159
+ return;
3160
+ }
3161
+ for (i = 0; i < n; i++) nodeid[i] = -1;
3162
+ for (i = n-1; i >= 0; i--)
3163
+ { if(nodeid[i]<0)
3164
+ { j = icluster;
3165
+ nodeid[i] = j;
3166
+ icluster++;
3167
+ }
3168
+ else j = nodeid[i];
3169
+ k = tree[i].left;
3170
+ if (k<0) nodeid[-k-1] = j; else clusterid[k] = j;
3171
+ k = tree[i].right;
3172
+ if (k<0) nodeid[-k-1] = j; else clusterid[k] = j;
3173
+ }
3174
+ free(nodeid);
3175
+ return;
3176
+ }
3177
+
3178
+ /* ******************************************************************** */
3179
+
3180
+ static
3181
+ Node* pclcluster (int nrows, int ncolumns, double** data, int** mask,
3182
+ double weight[], double** distmatrix, char dist, int transpose)
3183
+
3184
+ /*
3185
+
3186
+ Purpose
3187
+ =======
3188
+
3189
+ The pclcluster routine performs clustering using pairwise centroid-linking
3190
+ on a given set of gene expression data, using the distance metric given by dist.
3191
+
3192
+ Arguments
3193
+ =========
3194
+
3195
+ nrows (input) int
3196
+ The number of rows in the gene expression data matrix, equal to the number of
3197
+ genes.
3198
+
3199
+ ncolumns (input) int
3200
+ The number of columns in the gene expression data matrix, equal to the number of
3201
+ microarrays.
3202
+
3203
+ data (input) double[nrows][ncolumns]
3204
+ The array containing the gene expression data.
3205
+
3206
+ mask (input) int[nrows][ncolumns]
3207
+ This array shows which data values are missing. If
3208
+ mask[i][j] == 0, then data[i][j] is missing.
3209
+
3210
+ weight (input) double[ncolumns] if transpose==0;
3211
+ double[nrows] if transpose==1
3212
+ The weights that are used to calculate the distance. The length of this vector
3213
+ is ncolumns if genes are being clustered, and nrows if microarrays are being
3214
+ clustered.
3215
+
3216
+ transpose (input) int
3217
+ If transpose==0, the rows of the matrix are clustered. Otherwise, columns
3218
+ of the matrix are clustered.
3219
+
3220
+ dist (input) char
3221
+ Defines which distance measure is used, as given by the table:
3222
+ dist=='e': Euclidean distance
3223
+ dist=='b': City-block distance
3224
+ dist=='c': correlation
3225
+ dist=='a': absolute value of the correlation
3226
+ dist=='u': uncentered correlation
3227
+ dist=='x': absolute uncentered correlation
3228
+ dist=='s': Spearman's rank correlation
3229
+ dist=='k': Kendall's tau
3230
+ For other values of dist, the default (Euclidean distance) is used.
3231
+
3232
+ distmatrix (input) double**
3233
+ The distance matrix. This matrix is precalculated by the calling routine
3234
+ treecluster. The pclcluster routine modifies the contents of distmatrix, but
3235
+ does not deallocate it.
3236
+
3237
+ Return value
3238
+ ============
3239
+
3240
+ A pointer to a newly allocated array of Node structs, describing the
3241
+ hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3242
+ whether genes (rows) or microarrays (columns) were clustered, nelements is
3243
+ equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3244
+ structure.
3245
+ If a memory error occurs, pclcluster returns NULL.
3246
+ ========================================================================
3247
+ */
3248
+ { int i, j;
3249
+ const int nelements = (transpose==0) ? nrows : ncolumns;
3250
+ int inode;
3251
+ const int ndata = transpose ? nrows : ncolumns;
3252
+ const int nnodes = nelements - 1;
3253
+
3254
+ /* Set the metric function as indicated by dist */
3255
+ double (*metric)
3256
+ (int, double**, double**, int**, int**, const double[], int, int, int) =
3257
+ setmetric(dist);
3258
+
3259
+ Node* result;
3260
+ double** newdata;
3261
+ int** newmask;
3262
+ int* distid = malloc(nelements*sizeof(int));
3263
+ if(!distid) return NULL;
3264
+ result = malloc(nnodes*sizeof(Node));
3265
+ if(!result)
3266
+ { free(distid);
3267
+ return NULL;
3268
+ }
3269
+ if(!makedatamask(nelements, ndata, &newdata, &newmask))
3270
+ { free(result);
3271
+ free(distid);
3272
+ return NULL;
3273
+ }
3274
+
3275
+ for (i = 0; i < nelements; i++) distid[i] = i;
3276
+ /* To remember which row/column in the distance matrix contains what */
3277
+
3278
+ /* Storage for node data */
3279
+ if (transpose)
3280
+ { for (i = 0; i < nelements; i++)
3281
+ { for (j = 0; j < ndata; j++)
3282
+ { newdata[i][j] = data[j][i];
3283
+ newmask[i][j] = mask[j][i];
3284
+ }
3285
+ }
3286
+ data = newdata;
3287
+ mask = newmask;
3288
+ }
3289
+ else
3290
+ { for (i = 0; i < nelements; i++)
3291
+ { memcpy(newdata[i], data[i], ndata*sizeof(double));
3292
+ memcpy(newmask[i], mask[i], ndata*sizeof(int));
3293
+ }
3294
+ data = newdata;
3295
+ mask = newmask;
3296
+ }
3297
+
3298
+ for (inode = 0; inode < nnodes; inode++)
3299
+ { /* Find the pair with the shortest distance */
3300
+ int is = 1;
3301
+ int js = 0;
3302
+ result[inode].distance = find_closest_pair(nelements-inode, distmatrix, &is, &js);
3303
+ result[inode].left = distid[js];
3304
+ result[inode].right = distid[is];
3305
+
3306
+ /* Make node js the new node */
3307
+ for (i = 0; i < ndata; i++)
3308
+ { data[js][i] = data[js][i]*mask[js][i] + data[is][i]*mask[is][i];
3309
+ mask[js][i] += mask[is][i];
3310
+ if (mask[js][i]) data[js][i] /= mask[js][i];
3311
+ }
3312
+ free(data[is]);
3313
+ free(mask[is]);
3314
+ data[is] = data[nnodes-inode];
3315
+ mask[is] = mask[nnodes-inode];
3316
+
3317
+ /* Fix the distances */
3318
+ distid[is] = distid[nnodes-inode];
3319
+ for (i = 0; i < is; i++)
3320
+ distmatrix[is][i] = distmatrix[nnodes-inode][i];
3321
+ for (i = is + 1; i < nnodes-inode; i++)
3322
+ distmatrix[i][is] = distmatrix[nnodes-inode][i];
3323
+
3324
+ distid[js] = -inode-1;
3325
+ for (i = 0; i < js; i++)
3326
+ distmatrix[js][i] = metric(ndata,data,data,mask,mask,weight,js,i,0);
3327
+ for (i = js + 1; i < nnodes-inode; i++)
3328
+ distmatrix[i][js] = metric(ndata,data,data,mask,mask,weight,js,i,0);
3329
+ }
3330
+
3331
+ /* Free temporarily allocated space */
3332
+ free(data[0]);
3333
+ free(mask[0]);
3334
+ free(data);
3335
+ free(mask);
3336
+ free(distid);
3337
+
3338
+ return result;
3339
+ }
3340
+
3341
+ /* ******************************************************************** */
3342
+
3343
+ static
3344
+ int nodecompare(const void* a, const void* b)
3345
+ /* Helper function for qsort. */
3346
+ { const Node* node1 = (const Node*)a;
3347
+ const Node* node2 = (const Node*)b;
3348
+ const double term1 = node1->distance;
3349
+ const double term2 = node2->distance;
3350
+ if (term1 < term2) return -1;
3351
+ if (term1 > term2) return +1;
3352
+ return 0;
3353
+ }
3354
+
3355
+ /* ---------------------------------------------------------------------- */
3356
+
3357
+ static
3358
+ Node* pslcluster (int nrows, int ncolumns, double** data, int** mask,
3359
+ double weight[], double** distmatrix, char dist, int transpose)
3360
+
3361
+ /*
3362
+
3363
+ Purpose
3364
+ =======
3365
+
3366
+ The pslcluster routine performs single-linkage hierarchical clustering, using
3367
+ either the distance matrix directly, if available, or by calculating the
3368
+ distances from the data array. This implementation is based on the SLINK
3369
+ algorithm, described in:
3370
+ Sibson, R. (1973). SLINK: An optimally efficient algorithm for the single-link
3371
+ cluster method. The Computer Journal, 16(1): 30-34.
3372
+ The output of this algorithm is identical to conventional single-linkage
3373
+ hierarchical clustering, but is much more memory-efficient and faster. Hence,
3374
+ it can be applied to large data sets, for which the conventional single-
3375
+ linkage algorithm fails due to lack of memory.
3376
+
3377
+
3378
+ Arguments
3379
+ =========
3380
+
3381
+ nrows (input) int
3382
+ The number of rows in the gene expression data matrix, equal to the number of
3383
+ genes.
3384
+
3385
+ ncolumns (input) int
3386
+ The number of columns in the gene expression data matrix, equal to the number of
3387
+ microarrays.
3388
+
3389
+ data (input) double[nrows][ncolumns]
3390
+ The array containing the gene expression data.
3391
+
3392
+ mask (input) int[nrows][ncolumns]
3393
+ This array shows which data values are missing. If
3394
+ mask[i][j] == 0, then data[i][j] is missing.
3395
+
3396
+ weight (input) double[n]
3397
+ The weights that are used to calculate the distance. The length of this vector
3398
+ is ncolumns if genes are being clustered, and nrows if microarrays are being
3399
+ clustered.
3400
+
3401
+ transpose (input) int
3402
+ If transpose==0, the rows of the matrix are clustered. Otherwise, columns
3403
+ of the matrix are clustered.
3404
+
3405
+ dist (input) char
3406
+ Defines which distance measure is used, as given by the table:
3407
+ dist=='e': Euclidean distance
3408
+ dist=='b': City-block distance
3409
+ dist=='c': correlation
3410
+ dist=='a': absolute value of the correlation
3411
+ dist=='u': uncentered correlation
3412
+ dist=='x': absolute uncentered correlation
3413
+ dist=='s': Spearman's rank correlation
3414
+ dist=='k': Kendall's tau
3415
+ For other values of dist, the default (Euclidean distance) is used.
3416
+
3417
+ distmatrix (input) double**
3418
+ The distance matrix. If the distance matrix is passed by the calling routine
3419
+ treecluster, it is used by pslcluster to speed up the clustering calculation.
3420
+ The pslcluster routine does not modify the contents of distmatrix, and does
3421
+ not deallocate it. If distmatrix is NULL, the pairwise distances are calculated
3422
+ by the pslcluster routine from the gene expression data (the data and mask
3423
+ arrays) and stored in temporary arrays. If distmatrix is passed, the original
3424
+ gene expression data (specified by the data and mask arguments) are not needed
3425
+ and are therefore ignored.
3426
+
3427
+
3428
+ Return value
3429
+ ============
3430
+
3431
+ A pointer to a newly allocated array of Node structs, describing the
3432
+ hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3433
+ whether genes (rows) or microarrays (columns) were clustered, nelements is
3434
+ equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3435
+ structure.
3436
+ If a memory error occurs, pslcluster returns NULL.
3437
+
3438
+ ========================================================================
3439
+ */
3440
+ { int i, j, k;
3441
+ const int nelements = transpose ? ncolumns : nrows;
3442
+ const int nnodes = nelements - 1;
3443
+ int* vector;
3444
+ double* temp;
3445
+ int* index;
3446
+ Node* result;
3447
+ temp = malloc(nnodes*sizeof(double));
3448
+ if(!temp) return NULL;
3449
+ index = malloc(nelements*sizeof(int));
3450
+ if(!index)
3451
+ { free(temp);
3452
+ return NULL;
3453
+ }
3454
+ vector = malloc(nnodes*sizeof(int));
3455
+ if(!vector)
3456
+ { free(index);
3457
+ free(temp);
3458
+ return NULL;
3459
+ }
3460
+ result = malloc(nelements*sizeof(Node));
3461
+ if(!result)
3462
+ { free(vector);
3463
+ free(index);
3464
+ free(temp);
3465
+ return NULL;
3466
+ }
3467
+
3468
+ for (i = 0; i < nnodes; i++) vector[i] = i;
3469
+
3470
+ if(distmatrix)
3471
+ { for (i = 0; i < nrows; i++)
3472
+ { result[i].distance = DBL_MAX;
3473
+ for (j = 0; j < i; j++) temp[j] = distmatrix[i][j];
3474
+ for (j = 0; j < i; j++)
3475
+ { k = vector[j];
3476
+ if (result[j].distance >= temp[j])
3477
+ { if (result[j].distance < temp[k]) temp[k] = result[j].distance;
3478
+ result[j].distance = temp[j];
3479
+ vector[j] = i;
3480
+ }
3481
+ else if (temp[j] < temp[k]) temp[k] = temp[j];
3482
+ }
3483
+ for (j = 0; j < i; j++)
3484
+ {
3485
+ if (result[j].distance >= result[vector[j]].distance) vector[j] = i;
3486
+ }
3487
+ }
3488
+ }
3489
+ else
3490
+ { const int ndata = transpose ? nrows : ncolumns;
3491
+ /* Set the metric function as indicated by dist */
3492
+ double (*metric)
3493
+ (int, double**, double**, int**, int**, const double[], int, int, int) =
3494
+ setmetric(dist);
3495
+
3496
+ for (i = 0; i < nelements; i++)
3497
+ { result[i].distance = DBL_MAX;
3498
+ for (j = 0; j < i; j++) temp[j] =
3499
+ metric(ndata, data, data, mask, mask, weight, i, j, transpose);
3500
+ for (j = 0; j < i; j++)
3501
+ { k = vector[j];
3502
+ if (result[j].distance >= temp[j])
3503
+ { if (result[j].distance < temp[k]) temp[k] = result[j].distance;
3504
+ result[j].distance = temp[j];
3505
+ vector[j] = i;
3506
+ }
3507
+ else if (temp[j] < temp[k]) temp[k] = temp[j];
3508
+ }
3509
+ for (j = 0; j < i; j++)
3510
+ if (result[j].distance >= result[vector[j]].distance) vector[j] = i;
3511
+ }
3512
+ }
3513
+ free(temp);
3514
+
3515
+ for (i = 0; i < nnodes; i++) result[i].left = i;
3516
+ qsort(result, nnodes, sizeof(Node), nodecompare);
3517
+
3518
+ for (i = 0; i < nelements; i++) index[i] = i;
3519
+ for (i = 0; i < nnodes; i++)
3520
+ { j = result[i].left;
3521
+ k = vector[j];
3522
+ result[i].left = index[j];
3523
+ result[i].right = index[k];
3524
+ index[k] = -i-1;
3525
+ }
3526
+ free(vector);
3527
+ free(index);
3528
+
3529
+ result = realloc(result, nnodes*sizeof(Node));
3530
+
3531
+ return result;
3532
+ }
3533
+ /* ******************************************************************** */
3534
+
3535
+ static Node* pmlcluster (int nelements, double** distmatrix)
3536
+ /*
3537
+
3538
+ Purpose
3539
+ =======
3540
+
3541
+ The pmlcluster routine performs clustering using pairwise maximum- (complete-)
3542
+ linking on the given distance matrix.
3543
+
3544
+ Arguments
3545
+ =========
3546
+
3547
+ nelements (input) int
3548
+ The number of elements to be clustered.
3549
+
3550
+ distmatrix (input) double**
3551
+ The distance matrix, with nelements rows, each row being filled up to the
3552
+ diagonal. The elements on the diagonal are not used, as they are assumed to be
3553
+ zero. The distance matrix will be modified by this routine.
3554
+
3555
+ Return value
3556
+ ============
3557
+
3558
+ A pointer to a newly allocated array of Node structs, describing the
3559
+ hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3560
+ whether genes (rows) or microarrays (columns) were clustered, nelements is
3561
+ equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3562
+ structure.
3563
+ If a memory error occurs, pmlcluster returns NULL.
3564
+ ========================================================================
3565
+ */
3566
+ { int j;
3567
+ int n;
3568
+ int* clusterid;
3569
+ Node* result;
3570
+
3571
+ clusterid = malloc(nelements*sizeof(int));
3572
+ if(!clusterid) return NULL;
3573
+ result = malloc((nelements-1)*sizeof(Node));
3574
+ if (!result)
3575
+ { free(clusterid);
3576
+ return NULL;
3577
+ }
3578
+
3579
+ /* Setup a list specifying to which cluster a gene belongs */
3580
+ for (j = 0; j < nelements; j++) clusterid[j] = j;
3581
+
3582
+ for (n = nelements; n > 1; n--)
3583
+ { int is = 1;
3584
+ int js = 0;
3585
+ result[nelements-n].distance = find_closest_pair(n, distmatrix, &is, &js);
3586
+
3587
+ /* Fix the distances */
3588
+ for (j = 0; j < js; j++)
3589
+ distmatrix[js][j] = max(distmatrix[is][j],distmatrix[js][j]);
3590
+ for (j = js+1; j < is; j++)
3591
+ distmatrix[j][js] = max(distmatrix[is][j],distmatrix[j][js]);
3592
+ for (j = is+1; j < n; j++)
3593
+ distmatrix[j][js] = max(distmatrix[j][is],distmatrix[j][js]);
3594
+
3595
+ for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
3596
+ for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
3597
+
3598
+ /* Update clusterids */
3599
+ result[nelements-n].left = clusterid[is];
3600
+ result[nelements-n].right = clusterid[js];
3601
+ clusterid[js] = n-nelements-1;
3602
+ clusterid[is] = clusterid[n-1];
3603
+ }
3604
+ free(clusterid);
3605
+
3606
+ return result;
3607
+ }
3608
+
3609
+ /* ******************************************************************* */
3610
+
3611
+ static Node* palcluster (int nelements, double** distmatrix)
3612
+ /*
3613
+ Purpose
3614
+ =======
3615
+
3616
+ The palcluster routine performs clustering using pairwise average
3617
+ linking on the given distance matrix.
3618
+
3619
+ Arguments
3620
+ =========
3621
+
3622
+ nelements (input) int
3623
+ The number of elements to be clustered.
3624
+
3625
+ distmatrix (input) double**
3626
+ The distance matrix, with nelements rows, each row being filled up to the
3627
+ diagonal. The elements on the diagonal are not used, as they are assumed to be
3628
+ zero. The distance matrix will be modified by this routine.
3629
+
3630
+ Return value
3631
+ ============
3632
+
3633
+ A pointer to a newly allocated array of Node structs, describing the
3634
+ hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3635
+ whether genes (rows) or microarrays (columns) were clustered, nelements is
3636
+ equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3637
+ structure.
3638
+ If a memory error occurs, palcluster returns NULL.
3639
+ ========================================================================
3640
+ */
3641
+ { int j;
3642
+ int n;
3643
+ int* clusterid;
3644
+ int* number;
3645
+ Node* result;
3646
+
3647
+ clusterid = malloc(nelements*sizeof(int));
3648
+ if(!clusterid) return NULL;
3649
+ number = malloc(nelements*sizeof(int));
3650
+ if(!number)
3651
+ { free(clusterid);
3652
+ return NULL;
3653
+ }
3654
+ result = malloc((nelements-1)*sizeof(Node));
3655
+ if (!result)
3656
+ { free(clusterid);
3657
+ free(number);
3658
+ return NULL;
3659
+ }
3660
+
3661
+ /* Setup a list specifying to which cluster a gene belongs, and keep track
3662
+ * of the number of elements in each cluster (needed to calculate the
3663
+ * average). */
3664
+ for (j = 0; j < nelements; j++)
3665
+ { number[j] = 1;
3666
+ clusterid[j] = j;
3667
+ }
3668
+
3669
+ for (n = nelements; n > 1; n--)
3670
+ { int sum;
3671
+ int is = 1;
3672
+ int js = 0;
3673
+ result[nelements-n].distance = find_closest_pair(n, distmatrix, &is, &js);
3674
+
3675
+ /* Save result */
3676
+ result[nelements-n].left = clusterid[is];
3677
+ result[nelements-n].right = clusterid[js];
3678
+
3679
+ /* Fix the distances */
3680
+ sum = number[is] + number[js];
3681
+ for (j = 0; j < js; j++)
3682
+ { distmatrix[js][j] = distmatrix[is][j]*number[is]
3683
+ + distmatrix[js][j]*number[js];
3684
+ distmatrix[js][j] /= sum;
3685
+ }
3686
+ for (j = js+1; j < is; j++)
3687
+ { distmatrix[j][js] = distmatrix[is][j]*number[is]
3688
+ + distmatrix[j][js]*number[js];
3689
+ distmatrix[j][js] /= sum;
3690
+ }
3691
+ for (j = is+1; j < n; j++)
3692
+ { distmatrix[j][js] = distmatrix[j][is]*number[is]
3693
+ + distmatrix[j][js]*number[js];
3694
+ distmatrix[j][js] /= sum;
3695
+ }
3696
+
3697
+ for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
3698
+ for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
3699
+
3700
+ /* Update number of elements in the clusters */
3701
+ number[js] = sum;
3702
+ number[is] = number[n-1];
3703
+
3704
+ /* Update clusterids */
3705
+ clusterid[js] = n-nelements-1;
3706
+ clusterid[is] = clusterid[n-1];
3707
+ }
3708
+ free(clusterid);
3709
+ free(number);
3710
+
3711
+ return result;
3712
+ }
3713
+
3714
+ /* ******************************************************************* */
3715
+
3716
+ Node* treecluster (int nrows, int ncolumns, double** data, int** mask,
3717
+ double weight[], int transpose, char dist, char method, double** distmatrix)
3718
+ /*
3719
+ Purpose
3720
+ =======
3721
+
3722
+ The treecluster routine performs hierarchical clustering using pairwise
3723
+ single-, maximum-, centroid-, or average-linkage, as defined by method, on a
3724
+ given set of gene expression data, using the distance metric given by dist.
3725
+ If successful, the function returns a pointer to a newly allocated Tree struct
3726
+ containing the hierarchical clustering solution, and NULL if a memory error
3727
+ occurs. The pointer should be freed by the calling routine to prevent memory
3728
+ leaks.
3729
+
3730
+ Arguments
3731
+ =========
3732
+
3733
+ nrows (input) int
3734
+ The number of rows in the data matrix, equal to the number of genes.
3735
+
3736
+ ncolumns (input) int
3737
+ The number of columns in the data matrix, equal to the number of microarrays.
3738
+
3739
+ data (input) double[nrows][ncolumns]
3740
+ The array containing the data of the vectors to be clustered.
3741
+
3742
+ mask (input) int[nrows][ncolumns]
3743
+ This array shows which data values are missing. If mask[i][j]==0, then
3744
+ data[i][j] is missing.
3745
+
3746
+ weight (input) double array[n]
3747
+ The weights that are used to calculate the distance.
3748
+
3749
+ transpose (input) int
3750
+ If transpose==0, the rows of the matrix are clustered. Otherwise, columns
3751
+ of the matrix are clustered.
3752
+
3753
+ dist (input) char
3754
+ Defines which distance measure is used, as given by the table:
3755
+ dist=='e': Euclidean distance
3756
+ dist=='b': City-block distance
3757
+ dist=='c': correlation
3758
+ dist=='a': absolute value of the correlation
3759
+ dist=='u': uncentered correlation
3760
+ dist=='x': absolute uncentered correlation
3761
+ dist=='s': Spearman's rank correlation
3762
+ dist=='k': Kendall's tau
3763
+ For other values of dist, the default (Euclidean distance) is used.
3764
+
3765
+ method (input) char
3766
+ Defines which hierarchical clustering method is used:
3767
+ method=='s': pairwise single-linkage clustering
3768
+ method=='m': pairwise maximum- (or complete-) linkage clustering
3769
+ method=='a': pairwise average-linkage clustering
3770
+ method=='c': pairwise centroid-linkage clustering
3771
+ For the first three, either the distance matrix or the gene expression data is
3772
+ sufficient to perform the clustering algorithm. For pairwise centroid-linkage
3773
+ clustering, however, the gene expression data are always needed, even if the
3774
+ distance matrix itself is available.
3775
+
3776
+ distmatrix (input) double**
3777
+ The distance matrix. If the distance matrix is zero initially, the distance
3778
+ matrix will be allocated and calculated from the data by treecluster, and
3779
+ deallocated before treecluster returns. If the distance matrix is passed by the
3780
+ calling routine, treecluster will modify the contents of the distance matrix as
3781
+ part of the clustering algorithm, but will not deallocate it. The calling
3782
+ routine should deallocate the distance matrix after the return from treecluster.
3783
+
3784
+ Return value
3785
+ ============
3786
+
3787
+ A pointer to a newly allocated array of Node structs, describing the
3788
+ hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3789
+ whether genes (rows) or microarrays (columns) were clustered, nelements is
3790
+ equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3791
+ structure.
3792
+ If a memory error occurs, treecluster returns NULL.
3793
+
3794
+ ========================================================================
3795
+ */
3796
+ { Node* result = NULL;
3797
+ const int nelements = (transpose==0) ? nrows : ncolumns;
3798
+ const int ldistmatrix = (distmatrix==NULL && method!='s') ? 1 : 0;
3799
+
3800
+ if (nelements < 2) return NULL;
3801
+
3802
+ /* Calculate the distance matrix if the user didn't give it */
3803
+ if(ldistmatrix)
3804
+ { distmatrix =
3805
+ distancematrix(nrows, ncolumns, data, mask, weight, dist, transpose);
3806
+ if (!distmatrix) return NULL; /* Insufficient memory */
3807
+ }
3808
+
3809
+ switch(method)
3810
+ { case 's':
3811
+ result = pslcluster(nrows, ncolumns, data, mask, weight, distmatrix,
3812
+ dist, transpose);
3813
+ break;
3814
+ case 'm':
3815
+ result = pmlcluster(nelements, distmatrix);
3816
+ break;
3817
+ case 'a':
3818
+ result = palcluster(nelements, distmatrix);
3819
+ break;
3820
+ case 'c':
3821
+ result = pclcluster(nrows, ncolumns, data, mask, weight, distmatrix,
3822
+ dist, transpose);
3823
+ break;
3824
+ }
3825
+
3826
+ /* Deallocate space for distance matrix, if it was allocated by treecluster */
3827
+ if(ldistmatrix)
3828
+ { int i;
3829
+ for (i = 1; i < nelements; i++) free(distmatrix[i]);
3830
+ free (distmatrix);
3831
+ }
3832
+
3833
+ return result;
3834
+ }
3835
+
3836
+ /* ******************************************************************* */
3837
+
3838
+ static
3839
+ void somworker (int nrows, int ncolumns, double** data, int** mask,
3840
+ const double weights[], int transpose, int nxgrid, int nygrid,
3841
+ double inittau, double*** celldata, int niter, char dist)
3842
+
3843
+ { const int nelements = (transpose==0) ? nrows : ncolumns;
3844
+ const int ndata = (transpose==0) ? ncolumns : nrows;
3845
+ int i, j;
3846
+ double* stddata = calloc(nelements,sizeof(double));
3847
+ int** dummymask;
3848
+ int ix, iy;
3849
+ int* index;
3850
+ int iter;
3851
+ /* Maximum radius in which nodes are adjusted */
3852
+ double maxradius = sqrt(nxgrid*nxgrid+nygrid*nygrid);
3853
+
3854
+ /* Set the metric function as indicated by dist */
3855
+ double (*metric)
3856
+ (int, double**, double**, int**, int**, const double[], int, int, int) =
3857
+ setmetric(dist);
3858
+
3859
+ /* Calculate the standard deviation for each row or column */
3860
+ if (transpose==0)
3861
+ { for (i = 0; i < nelements; i++)
3862
+ { int n = 0;
3863
+ for (j = 0; j < ndata; j++)
3864
+ { if (mask[i][j])
3865
+ { double term = data[i][j];
3866
+ term = term * term;
3867
+ stddata[i] += term;
3868
+ n++;
3869
+ }
3870
+ }
3871
+ if (stddata[i] > 0) stddata[i] = sqrt(stddata[i]/n);
3872
+ else stddata[i] = 1;
3873
+ }
3874
+ }
3875
+ else
3876
+ { for (i = 0; i < nelements; i++)
3877
+ { int n = 0;
3878
+ for (j = 0; j < ndata; j++)
3879
+ { if (mask[j][i])
3880
+ { double term = data[j][i];
3881
+ term = term * term;
3882
+ stddata[i] += term;
3883
+ n++;
3884
+ }
3885
+ }
3886
+ if (stddata[i] > 0) stddata[i] = sqrt(stddata[i]/n);
3887
+ else stddata[i] = 1;
3888
+ }
3889
+ }
3890
+
3891
+ if (transpose==0)
3892
+ { dummymask = malloc(nygrid*sizeof(int*));
3893
+ for (i = 0; i < nygrid; i++)
3894
+ { dummymask[i] = malloc(ndata*sizeof(int));
3895
+ for (j = 0; j < ndata; j++) dummymask[i][j] = 1;
3896
+ }
3897
+ }
3898
+ else
3899
+ { dummymask = malloc(ndata*sizeof(int*));
3900
+ for (i = 0; i < ndata; i++)
3901
+ { dummymask[i] = malloc(sizeof(int));
3902
+ dummymask[i][0] = 1;
3903
+ }
3904
+ }
3905
+
3906
+ /* Randomly initialize the nodes */
3907
+ for (ix = 0; ix < nxgrid; ix++)
3908
+ { for (iy = 0; iy < nygrid; iy++)
3909
+ { double sum = 0.;
3910
+ for (i = 0; i < ndata; i++)
3911
+ { double term = -1.0 + 2.0*uniform();
3912
+ celldata[ix][iy][i] = term;
3913
+ sum += term * term;
3914
+ }
3915
+ sum = sqrt(sum/ndata);
3916
+ for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum;
3917
+ }
3918
+ }
3919
+
3920
+ /* Randomize the order in which genes or arrays will be used */
3921
+ index = malloc(nelements*sizeof(int));
3922
+ for (i = 0; i < nelements; i++) index[i] = i;
3923
+ for (i = 0; i < nelements; i++)
3924
+ { j = (int) (i + (nelements-i)*uniform());
3925
+ ix = index[j];
3926
+ index[j] = index[i];
3927
+ index[i] = ix;
3928
+ }
3929
+
3930
+ /* Start the iteration */
3931
+ for (iter = 0; iter < niter; iter++)
3932
+ { int ixbest = 0;
3933
+ int iybest = 0;
3934
+ int iobject = iter % nelements;
3935
+ iobject = index[iobject];
3936
+ if (transpose==0)
3937
+ { double closest = metric(ndata,data,celldata[ixbest],
3938
+ mask,dummymask,weights,iobject,iybest,transpose);
3939
+ double radius = maxradius * (1. - ((double)iter)/((double)niter));
3940
+ double tau = inittau * (1. - ((double)iter)/((double)niter));
3941
+
3942
+ for (ix = 0; ix < nxgrid; ix++)
3943
+ { for (iy = 0; iy < nygrid; iy++)
3944
+ { double distance =
3945
+ metric (ndata,data,celldata[ix],
3946
+ mask,dummymask,weights,iobject,iy,transpose);
3947
+ if (distance < closest)
3948
+ { ixbest = ix;
3949
+ iybest = iy;
3950
+ closest = distance;
3951
+ }
3952
+ }
3953
+ }
3954
+ for (ix = 0; ix < nxgrid; ix++)
3955
+ { for (iy = 0; iy < nygrid; iy++)
3956
+ { if (sqrt((ix-ixbest)*(ix-ixbest)+(iy-iybest)*(iy-iybest))<radius)
3957
+ { double sum = 0.;
3958
+ for (i = 0; i < ndata; i++)
3959
+ { if (mask[iobject][i]==0) continue;
3960
+ celldata[ix][iy][i] +=
3961
+ tau * (data[iobject][i]/stddata[iobject]-celldata[ix][iy][i]);
3962
+ }
3963
+ for (i = 0; i < ndata; i++)
3964
+ { double term = celldata[ix][iy][i];
3965
+ term = term * term;
3966
+ sum += term;
3967
+ }
3968
+ if (sum>0)
3969
+ { sum = sqrt(sum/ndata);
3970
+ for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum;
3971
+ }
3972
+ }
3973
+ }
3974
+ }
3975
+ }
3976
+ else
3977
+ { double closest;
3978
+ double** celldatavector = malloc(ndata*sizeof(double*));
3979
+ double radius = maxradius * (1. - ((double)iter)/((double)niter));
3980
+ double tau = inittau * (1. - ((double)iter)/((double)niter));
3981
+
3982
+ for (i = 0; i < ndata; i++)
3983
+ celldatavector[i] = &(celldata[ixbest][iybest][i]);
3984
+ closest = metric(ndata,data,celldatavector,
3985
+ mask,dummymask,weights,iobject,0,transpose);
3986
+ for (ix = 0; ix < nxgrid; ix++)
3987
+ { for (iy = 0; iy < nygrid; iy++)
3988
+ { double distance;
3989
+ for (i = 0; i < ndata; i++)
3990
+ celldatavector[i] = &(celldata[ixbest][iybest][i]);
3991
+ distance =
3992
+ metric (ndata,data,celldatavector,
3993
+ mask,dummymask,weights,iobject,0,transpose);
3994
+ if (distance < closest)
3995
+ { ixbest = ix;
3996
+ iybest = iy;
3997
+ closest = distance;
3998
+ }
3999
+ }
4000
+ }
4001
+ free(celldatavector);
4002
+ for (ix = 0; ix < nxgrid; ix++)
4003
+ { for (iy = 0; iy < nygrid; iy++)
4004
+ { if (sqrt((ix-ixbest)*(ix-ixbest)+(iy-iybest)*(iy-iybest))<radius)
4005
+ { double sum = 0.;
4006
+ for (i = 0; i < ndata; i++)
4007
+ { if (mask[i][iobject]==0) continue;
4008
+ celldata[ix][iy][i] +=
4009
+ tau * (data[i][iobject]/stddata[iobject]-celldata[ix][iy][i]);
4010
+ }
4011
+ for (i = 0; i < ndata; i++)
4012
+ { double term = celldata[ix][iy][i];
4013
+ term = term * term;
4014
+ sum += term;
4015
+ }
4016
+ if (sum>0)
4017
+ { sum = sqrt(sum/ndata);
4018
+ for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum;
4019
+ }
4020
+ }
4021
+ }
4022
+ }
4023
+ }
4024
+ }
4025
+ if (transpose==0)
4026
+ for (i = 0; i < nygrid; i++) free(dummymask[i]);
4027
+ else
4028
+ for (i = 0; i < ndata; i++) free(dummymask[i]);
4029
+ free(dummymask);
4030
+ free(stddata);
4031
+ free(index);
4032
+ return;
4033
+ }
4034
+
4035
+ /* ******************************************************************* */
4036
+
4037
+ static
4038
+ void somassign (int nrows, int ncolumns, double** data, int** mask,
4039
+ const double weights[], int transpose, int nxgrid, int nygrid,
4040
+ double*** celldata, char dist, int clusterid[][2])
4041
+ /* Collect clusterids */
4042
+ { const int ndata = (transpose==0) ? ncolumns : nrows;
4043
+ int i,j;
4044
+
4045
+ /* Set the metric function as indicated by dist */
4046
+ double (*metric)
4047
+ (int, double**, double**, int**, int**, const double[], int, int, int) =
4048
+ setmetric(dist);
4049
+
4050
+ if (transpose==0)
4051
+ { int** dummymask = malloc(nygrid*sizeof(int*));
4052
+ for (i = 0; i < nygrid; i++)
4053
+ { dummymask[i] = malloc(ncolumns*sizeof(int));
4054
+ for (j = 0; j < ncolumns; j++) dummymask[i][j] = 1;
4055
+ }
4056
+ for (i = 0; i < nrows; i++)
4057
+ { int ixbest = 0;
4058
+ int iybest = 0;
4059
+ double closest = metric(ndata,data,celldata[ixbest],
4060
+ mask,dummymask,weights,i,iybest,transpose);
4061
+ int ix, iy;
4062
+ for (ix = 0; ix < nxgrid; ix++)
4063
+ { for (iy = 0; iy < nygrid; iy++)
4064
+ { double distance =
4065
+ metric (ndata,data,celldata[ix],
4066
+ mask,dummymask,weights,i,iy,transpose);
4067
+ if (distance < closest)
4068
+ { ixbest = ix;
4069
+ iybest = iy;
4070
+ closest = distance;
4071
+ }
4072
+ }
4073
+ }
4074
+ clusterid[i][0] = ixbest;
4075
+ clusterid[i][1] = iybest;
4076
+ }
4077
+ for (i = 0; i < nygrid; i++) free(dummymask[i]);
4078
+ free(dummymask);
4079
+ }
4080
+ else
4081
+ { double** celldatavector = malloc(ndata*sizeof(double*));
4082
+ int** dummymask = malloc(nrows*sizeof(int*));
4083
+ int ixbest = 0;
4084
+ int iybest = 0;
4085
+ for (i = 0; i < nrows; i++)
4086
+ { dummymask[i] = malloc(sizeof(int));
4087
+ dummymask[i][0] = 1;
4088
+ }
4089
+ for (i = 0; i < ncolumns; i++)
4090
+ { double closest;
4091
+ int ix, iy;
4092
+ for (j = 0; j < ndata; j++)
4093
+ celldatavector[j] = &(celldata[ixbest][iybest][j]);
4094
+ closest = metric(ndata,data,celldatavector,
4095
+ mask,dummymask,weights,i,0,transpose);
4096
+ for (ix = 0; ix < nxgrid; ix++)
4097
+ { for (iy = 0; iy < nygrid; iy++)
4098
+ { double distance;
4099
+ for(j = 0; j < ndata; j++)
4100
+ celldatavector[j] = &(celldata[ix][iy][j]);
4101
+ distance = metric(ndata,data,celldatavector,
4102
+ mask,dummymask,weights,i,0,transpose);
4103
+ if (distance < closest)
4104
+ { ixbest = ix;
4105
+ iybest = iy;
4106
+ closest = distance;
4107
+ }
4108
+ }
4109
+ }
4110
+ clusterid[i][0] = ixbest;
4111
+ clusterid[i][1] = iybest;
4112
+ }
4113
+ free(celldatavector);
4114
+ for (i = 0; i < nrows; i++) free(dummymask[i]);
4115
+ free(dummymask);
4116
+ }
4117
+ return;
4118
+ }
4119
+
4120
+ /* ******************************************************************* */
4121
+
4122
+ void somcluster (int nrows, int ncolumns, double** data, int** mask,
4123
+ const double weight[], int transpose, int nxgrid, int nygrid,
4124
+ double inittau, int niter, char dist, double*** celldata, int clusterid[][2])
4125
+ /*
4126
+
4127
+ Purpose
4128
+ =======
4129
+
4130
+ The somcluster routine implements a self-organizing map (Kohonen) on a
4131
+ rectangular grid, using a given set of vectors. The distance measure to be
4132
+ used to find the similarity between genes and nodes is given by dist.
4133
+
4134
+ Arguments
4135
+ =========
4136
+
4137
+ nrows (input) int
4138
+ The number of rows in the data matrix, equal to the number of genes.
4139
+
4140
+ ncolumns (input) int
4141
+ The number of columns in the data matrix, equal to the number of microarrays.
4142
+
4143
+ data (input) double[nrows][ncolumns]
4144
+ The array containing the gene expression data.
4145
+
4146
+ mask (input) int[nrows][ncolumns]
4147
+ This array shows which data values are missing. If
4148
+ mask[i][j] == 0, then data[i][j] is missing.
4149
+
4150
+ weights (input) double[ncolumns] if transpose==0;
4151
+ double[nrows] if transpose==1
4152
+ The weights that are used to calculate the distance. The length of this vector
4153
+ is ncolumns if genes are being clustered, or nrows if microarrays are being
4154
+ clustered.
4155
+
4156
+ transpose (input) int
4157
+ If transpose==0, the rows (genes) of the matrix are clustered. Otherwise,
4158
+ columns (microarrays) of the matrix are clustered.
4159
+
4160
+ nxgrid (input) int
4161
+ The number of grid cells horizontally in the rectangular topology of clusters.
4162
+
4163
+ nygrid (input) int
4164
+ The number of grid cells horizontally in the rectangular topology of clusters.
4165
+
4166
+ inittau (input) double
4167
+ The initial value of tau, representing the neighborhood function.
4168
+
4169
+ niter (input) int
4170
+ The number of iterations to be performed.
4171
+
4172
+ dist (input) char
4173
+ Defines which distance measure is used, as given by the table:
4174
+ dist=='e': Euclidean distance
4175
+ dist=='b': City-block distance
4176
+ dist=='c': correlation
4177
+ dist=='a': absolute value of the correlation
4178
+ dist=='u': uncentered correlation
4179
+ dist=='x': absolute uncentered correlation
4180
+ dist=='s': Spearman's rank correlation
4181
+ dist=='k': Kendall's tau
4182
+ For other values of dist, the default (Euclidean distance) is used.
4183
+
4184
+ celldata (output) double[nxgrid][nygrid][ncolumns] if transpose==0;
4185
+ double[nxgrid][nygrid][nrows] if tranpose==1
4186
+ The gene expression data for each node (cell) in the 2D grid. This can be
4187
+ interpreted as the centroid for the cluster corresponding to that cell. If
4188
+ celldata is NULL, then the centroids are not returned. If celldata is not
4189
+ NULL, enough space should be allocated to store the centroid data before callingsomcluster.
4190
+
4191
+ clusterid (output), int[nrows][2] if transpose==0;
4192
+ int[ncolumns][2] if transpose==1
4193
+ For each item (gene or microarray) that is clustered, the coordinates of the
4194
+ cell in the 2D grid to which the item was assigned. If clusterid is NULL, the
4195
+ cluster assignments are not returned. If clusterid is not NULL, enough memory
4196
+ should be allocated to store the clustering information before calling
4197
+ somcluster.
4198
+
4199
+ ========================================================================
4200
+ */
4201
+ { const int nobjects = (transpose==0) ? nrows : ncolumns;
4202
+ const int ndata = (transpose==0) ? ncolumns : nrows;
4203
+ int i,j;
4204
+ const int lcelldata = (celldata==NULL) ? 0 : 1;
4205
+
4206
+ if (nobjects < 2) return;
4207
+
4208
+ if (lcelldata==0)
4209
+ { celldata = malloc(nxgrid*nygrid*ndata*sizeof(double**));
4210
+ for (i = 0; i < nxgrid; i++)
4211
+ { celldata[i] = malloc(nygrid*ndata*sizeof(double*));
4212
+ for (j = 0; j < nygrid; j++)
4213
+ celldata[i][j] = malloc(ndata*sizeof(double));
4214
+ }
4215
+ }
4216
+
4217
+ somworker (nrows, ncolumns, data, mask, weight, transpose, nxgrid, nygrid,
4218
+ inittau, celldata, niter, dist);
4219
+ if (clusterid)
4220
+ somassign (nrows, ncolumns, data, mask, weight, transpose,
4221
+ nxgrid, nygrid, celldata, dist, clusterid);
4222
+ if(lcelldata==0)
4223
+ { for (i = 0; i < nxgrid; i++)
4224
+ for (j = 0; j < nygrid; j++)
4225
+ free(celldata[i][j]);
4226
+ for (i = 0; i < nxgrid; i++)
4227
+ free(celldata[i]);
4228
+ free(celldata);
4229
+ }
4230
+ return;
4231
+ }
4232
+
4233
+ /* ******************************************************************** */
4234
+
4235
+ double clusterdistance (int nrows, int ncolumns, double** data,
4236
+ int** mask, double weight[], int n1, int n2, int index1[], int index2[],
4237
+ char dist, char method, int transpose)
4238
+
4239
+ /*
4240
+ Purpose
4241
+ =======
4242
+
4243
+ The clusterdistance routine calculates the distance between two clusters
4244
+ containing genes or microarrays using the measured gene expression vectors. The
4245
+ distance between clusters, given the genes/microarrays in each cluster, can be
4246
+ defined in several ways. Several distance measures can be used.
4247
+
4248
+ The routine returns the distance in double precision.
4249
+ If the parameter transpose is set to a nonzero value, the clusters are
4250
+ interpreted as clusters of microarrays, otherwise as clusters of gene.
4251
+
4252
+ Arguments
4253
+ =========
4254
+
4255
+ nrows (input) int
4256
+ The number of rows (i.e., the number of genes) in the gene expression data
4257
+ matrix.
4258
+
4259
+ ncolumns (input) int
4260
+ The number of columns (i.e., the number of microarrays) in the gene expression
4261
+ data matrix.
4262
+
4263
+ data (input) double[nrows][ncolumns]
4264
+ The array containing the data of the vectors.
4265
+
4266
+ mask (input) int[nrows][ncolumns]
4267
+ This array shows which data values are missing. If mask[i][j]==0, then
4268
+ data[i][j] is missing.
4269
+
4270
+ weight (input) double[ncolumns] if transpose==0;
4271
+ double[nrows] if transpose==1
4272
+ The weights that are used to calculate the distance.
4273
+
4274
+ n1 (input) int
4275
+ The number of elements in the first cluster.
4276
+
4277
+ n2 (input) int
4278
+ The number of elements in the second cluster.
4279
+
4280
+ index1 (input) int[n1]
4281
+ Identifies which genes/microarrays belong to the first cluster.
4282
+
4283
+ index2 (input) int[n2]
4284
+ Identifies which genes/microarrays belong to the second cluster.
4285
+
4286
+ dist (input) char
4287
+ Defines which distance measure is used, as given by the table:
4288
+ dist=='e': Euclidean distance
4289
+ dist=='b': City-block distance
4290
+ dist=='c': correlation
4291
+ dist=='a': absolute value of the correlation
4292
+ dist=='u': uncentered correlation
4293
+ dist=='x': absolute uncentered correlation
4294
+ dist=='s': Spearman's rank correlation
4295
+ dist=='k': Kendall's tau
4296
+ For other values of dist, the default (Euclidean distance) is used.
4297
+
4298
+ method (input) char
4299
+ Defines how the distance between two clusters is defined, given which genes
4300
+ belong to which cluster:
4301
+ method=='a': the distance between the arithmetic means of the two clusters
4302
+ method=='m': the distance between the medians of the two clusters
4303
+ method=='s': the smallest pairwise distance between members of the two clusters
4304
+ method=='x': the largest pairwise distance between members of the two clusters
4305
+ method=='v': average of the pairwise distances between members of the clusters
4306
+
4307
+ transpose (input) int
4308
+ If transpose is equal to zero, the distances between the rows is
4309
+ calculated. Otherwise, the distances between the columns is calculated.
4310
+ The former is needed when genes are being clustered; the latter is used
4311
+ when microarrays are being clustered.
4312
+
4313
+ ========================================================================
4314
+ */
4315
+ { /* Set the metric function as indicated by dist */
4316
+ double (*metric)
4317
+ (int, double**, double**, int**, int**, const double[], int, int, int) =
4318
+ setmetric(dist);
4319
+
4320
+ /* if one or both clusters are empty, return */
4321
+ if (n1 < 1 || n2 < 1) return -1.0;
4322
+ /* Check the indices */
4323
+ if (transpose==0)
4324
+ { int i;
4325
+ for (i = 0; i < n1; i++)
4326
+ { int index = index1[i];
4327
+ if (index < 0 || index >= nrows) return -1.0;
4328
+ }
4329
+ for (i = 0; i < n2; i++)
4330
+ { int index = index2[i];
4331
+ if (index < 0 || index >= nrows) return -1.0;
4332
+ }
4333
+ }
4334
+ else
4335
+ { int i;
4336
+ for (i = 0; i < n1; i++)
4337
+ { int index = index1[i];
4338
+ if (index < 0 || index >= ncolumns) return -1.0;
4339
+ }
4340
+ for (i = 0; i < n2; i++)
4341
+ { int index = index2[i];
4342
+ if (index < 0 || index >= ncolumns) return -1.0;
4343
+ }
4344
+ }
4345
+
4346
+ switch (method)
4347
+ { case 'a':
4348
+ { /* Find the center */
4349
+ int i,j,k;
4350
+ if (transpose==0)
4351
+ { double distance;
4352
+ double* cdata[2];
4353
+ int* cmask[2];
4354
+ int* count[2];
4355
+ count[0] = calloc(ncolumns,sizeof(int));
4356
+ count[1] = calloc(ncolumns,sizeof(int));
4357
+ cdata[0] = calloc(ncolumns,sizeof(double));
4358
+ cdata[1] = calloc(ncolumns,sizeof(double));
4359
+ cmask[0] = malloc(ncolumns*sizeof(int));
4360
+ cmask[1] = malloc(ncolumns*sizeof(int));
4361
+ for (i = 0; i < n1; i++)
4362
+ { k = index1[i];
4363
+ for (j = 0; j < ncolumns; j++)
4364
+ if (mask[k][j] != 0)
4365
+ { cdata[0][j] = cdata[0][j] + data[k][j];
4366
+ count[0][j] = count[0][j] + 1;
4367
+ }
4368
+ }
4369
+ for (i = 0; i < n2; i++)
4370
+ { k = index2[i];
4371
+ for (j = 0; j < ncolumns; j++)
4372
+ if (mask[k][j] != 0)
4373
+ { cdata[1][j] = cdata[1][j] + data[k][j];
4374
+ count[1][j] = count[1][j] + 1;
4375
+ }
4376
+ }
4377
+ for (i = 0; i < 2; i++)
4378
+ for (j = 0; j < ncolumns; j++)
4379
+ { if (count[i][j]>0)
4380
+ { cdata[i][j] = cdata[i][j] / count[i][j];
4381
+ cmask[i][j] = 1;
4382
+ }
4383
+ else
4384
+ cmask[i][j] = 0;
4385
+ }
4386
+ distance =
4387
+ metric (ncolumns,cdata,cdata,cmask,cmask,weight,0,1,0);
4388
+ for (i = 0; i < 2; i++)
4389
+ { free (cdata[i]);
4390
+ free (cmask[i]);
4391
+ free (count[i]);
4392
+ }
4393
+ return distance;
4394
+ }
4395
+ else
4396
+ { double distance;
4397
+ int** count = malloc(nrows*sizeof(int*));
4398
+ double** cdata = malloc(nrows*sizeof(double*));
4399
+ int** cmask = malloc(nrows*sizeof(int*));
4400
+ for (i = 0; i < nrows; i++)
4401
+ { count[i] = calloc(2,sizeof(int));
4402
+ cdata[i] = calloc(2,sizeof(double));
4403
+ cmask[i] = malloc(2*sizeof(int));
4404
+ }
4405
+ for (i = 0; i < n1; i++)
4406
+ { k = index1[i];
4407
+ for (j = 0; j < nrows; j++)
4408
+ { if (mask[j][k] != 0)
4409
+ { cdata[j][0] = cdata[j][0] + data[j][k];
4410
+ count[j][0] = count[j][0] + 1;
4411
+ }
4412
+ }
4413
+ }
4414
+ for (i = 0; i < n2; i++)
4415
+ { k = index2[i];
4416
+ for (j = 0; j < nrows; j++)
4417
+ { if (mask[j][k] != 0)
4418
+ { cdata[j][1] = cdata[j][1] + data[j][k];
4419
+ count[j][1] = count[j][1] + 1;
4420
+ }
4421
+ }
4422
+ }
4423
+ for (i = 0; i < nrows; i++)
4424
+ for (j = 0; j < 2; j++)
4425
+ if (count[i][j]>0)
4426
+ { cdata[i][j] = cdata[i][j] / count[i][j];
4427
+ cmask[i][j] = 1;
4428
+ }
4429
+ else
4430
+ cmask[i][j] = 0;
4431
+ distance = metric (nrows,cdata,cdata,cmask,cmask,weight,0,1,1);
4432
+ for (i = 0; i < nrows; i++)
4433
+ { free (count[i]);
4434
+ free (cdata[i]);
4435
+ free (cmask[i]);
4436
+ }
4437
+ free (count);
4438
+ free (cdata);
4439
+ free (cmask);
4440
+ return distance;
4441
+ }
4442
+ }
4443
+ case 'm':
4444
+ { int i, j, k;
4445
+ if (transpose==0)
4446
+ { double distance;
4447
+ double* temp = malloc(nrows*sizeof(double));
4448
+ double* cdata[2];
4449
+ int* cmask[2];
4450
+ for (i = 0; i < 2; i++)
4451
+ { cdata[i] = malloc(ncolumns*sizeof(double));
4452
+ cmask[i] = malloc(ncolumns*sizeof(int));
4453
+ }
4454
+ for (j = 0; j < ncolumns; j++)
4455
+ { int count = 0;
4456
+ for (k = 0; k < n1; k++)
4457
+ { i = index1[k];
4458
+ if (mask[i][j])
4459
+ { temp[count] = data[i][j];
4460
+ count++;
4461
+ }
4462
+ }
4463
+ if (count>0)
4464
+ { cdata[0][j] = median (count,temp);
4465
+ cmask[0][j] = 1;
4466
+ }
4467
+ else
4468
+ { cdata[0][j] = 0.;
4469
+ cmask[0][j] = 0;
4470
+ }
4471
+ }
4472
+ for (j = 0; j < ncolumns; j++)
4473
+ { int count = 0;
4474
+ for (k = 0; k < n2; k++)
4475
+ { i = index2[k];
4476
+ if (mask[i][j])
4477
+ { temp[count] = data[i][j];
4478
+ count++;
4479
+ }
4480
+ }
4481
+ if (count>0)
4482
+ { cdata[1][j] = median (count,temp);
4483
+ cmask[1][j] = 1;
4484
+ }
4485
+ else
4486
+ { cdata[1][j] = 0.;
4487
+ cmask[1][j] = 0;
4488
+ }
4489
+ }
4490
+ distance = metric (ncolumns,cdata,cdata,cmask,cmask,weight,0,1,0);
4491
+ for (i = 0; i < 2; i++)
4492
+ { free (cdata[i]);
4493
+ free (cmask[i]);
4494
+ }
4495
+ free(temp);
4496
+ return distance;
4497
+ }
4498
+ else
4499
+ { double distance;
4500
+ double* temp = malloc(ncolumns*sizeof(double));
4501
+ double** cdata = malloc(nrows*sizeof(double*));
4502
+ int** cmask = malloc(nrows*sizeof(int*));
4503
+ for (i = 0; i < nrows; i++)
4504
+ { cdata[i] = malloc(2*sizeof(double));
4505
+ cmask[i] = malloc(2*sizeof(int));
4506
+ }
4507
+ for (j = 0; j < nrows; j++)
4508
+ { int count = 0;
4509
+ for (k = 0; k < n1; k++)
4510
+ { i = index1[k];
4511
+ if (mask[j][i])
4512
+ { temp[count] = data[j][i];
4513
+ count++;
4514
+ }
4515
+ }
4516
+ if (count>0)
4517
+ { cdata[j][0] = median (count,temp);
4518
+ cmask[j][0] = 1;
4519
+ }
4520
+ else
4521
+ { cdata[j][0] = 0.;
4522
+ cmask[j][0] = 0;
4523
+ }
4524
+ }
4525
+ for (j = 0; j < nrows; j++)
4526
+ { int count = 0;
4527
+ for (k = 0; k < n2; k++)
4528
+ { i = index2[k];
4529
+ if (mask[j][i])
4530
+ { temp[count] = data[j][i];
4531
+ count++;
4532
+ }
4533
+ }
4534
+ if (count>0)
4535
+ { cdata[j][1] = median (count,temp);
4536
+ cmask[j][1] = 1;
4537
+ }
4538
+ else
4539
+ { cdata[j][1] = 0.;
4540
+ cmask[j][1] = 0;
4541
+ }
4542
+ }
4543
+ distance = metric (nrows,cdata,cdata,cmask,cmask,weight,0,1,1);
4544
+ for (i = 0; i < nrows; i++)
4545
+ { free (cdata[i]);
4546
+ free (cmask[i]);
4547
+ }
4548
+ free(cdata);
4549
+ free(cmask);
4550
+ free(temp);
4551
+ return distance;
4552
+ }
4553
+ }
4554
+ case 's':
4555
+ { int i1, i2, j1, j2;
4556
+ const int n = (transpose==0) ? ncolumns : nrows;
4557
+ double mindistance = DBL_MAX;
4558
+ for (i1 = 0; i1 < n1; i1++)
4559
+ for (i2 = 0; i2 < n2; i2++)
4560
+ { double distance;
4561
+ j1 = index1[i1];
4562
+ j2 = index2[i2];
4563
+ distance = metric (n,data,data,mask,mask,weight,j1,j2,transpose);
4564
+ if (distance < mindistance) mindistance = distance;
4565
+ }
4566
+ return mindistance;
4567
+ }
4568
+ case 'x':
4569
+ { int i1, i2, j1, j2;
4570
+ const int n = (transpose==0) ? ncolumns : nrows;
4571
+ double maxdistance = 0;
4572
+ for (i1 = 0; i1 < n1; i1++)
4573
+ for (i2 = 0; i2 < n2; i2++)
4574
+ { double distance;
4575
+ j1 = index1[i1];
4576
+ j2 = index2[i2];
4577
+ distance = metric (n,data,data,mask,mask,weight,j1,j2,transpose);
4578
+ if (distance > maxdistance) maxdistance = distance;
4579
+ }
4580
+ return maxdistance;
4581
+ }
4582
+ case 'v':
4583
+ { int i1, i2, j1, j2;
4584
+ const int n = (transpose==0) ? ncolumns : nrows;
4585
+ double distance = 0;
4586
+ for (i1 = 0; i1 < n1; i1++)
4587
+ for (i2 = 0; i2 < n2; i2++)
4588
+ { j1 = index1[i1];
4589
+ j2 = index2[i2];
4590
+ distance += metric (n,data,data,mask,mask,weight,j1,j2,transpose);
4591
+ }
4592
+ distance /= (n1*n2);
4593
+ return distance;
4594
+ }
4595
+ }
4596
+ /* Never get here */
4597
+ return -2.0;
4598
+ }