rbcluster 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +9 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE +29 -0
- data/README.md +54 -0
- data/Rakefile +17 -0
- data/examples/simple_kcluster.rb +10 -0
- data/ext/rbcluster/cluster.c +4598 -0
- data/ext/rbcluster/cluster.h +93 -0
- data/ext/rbcluster/extconf.rb +6 -0
- data/ext/rbcluster/rbcluster.c +775 -0
- data/lib/rbcluster.rb +5 -0
- data/lib/rbcluster/tree.rb +20 -0
- data/lib/rbcluster/version.rb +3 -0
- data/rbcluster.gemspec +24 -0
- data/spec/clustercentroids_spec.rb +6 -0
- data/spec/clusterdistance_spec.rb +106 -0
- data/spec/clustermedoids_spec.rb +6 -0
- data/spec/cuttree_spec.rb +6 -0
- data/spec/kcluster_spec.rb +95 -0
- data/spec/kmedoids_spec.rb +86 -0
- data/spec/median_mean_spec.rb +26 -0
- data/spec/node_spec.rb +27 -0
- data/spec/pca_spec.rb +113 -0
- data/spec/somcluster_spec.rb +81 -0
- data/spec/spec_helper.rb +3 -0
- data/spec/treecluster_spec.rb +412 -0
- metadata +110 -0
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
rbcluster
|
2
|
+
Copyright (C) 2011-2012 Jari Bakken.
|
3
|
+
|
4
|
+
The C clustering library.
|
5
|
+
Copyright (C) 2002 Michiel Jan Laurens de Hoon.
|
6
|
+
|
7
|
+
This library was written at the Laboratory of DNA Information Analysis,
|
8
|
+
Human Genome Center, Institute of Medical Science, University of Tokyo,
|
9
|
+
4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
|
10
|
+
Contact: mdehoon 'AT' gsc.riken.jp
|
11
|
+
|
12
|
+
Permission to use, copy, modify, and distribute this software and its
|
13
|
+
documentation with or without modifications and for any purpose and
|
14
|
+
without fee is hereby granted, provided that any copyright notices
|
15
|
+
appear in all copies and that both those copyright notices and this
|
16
|
+
permission notice appear in supporting documentation, and that the
|
17
|
+
names of the contributors or copyright holders not be used in
|
18
|
+
advertising or publicity pertaining to distribution of the software
|
19
|
+
without specific prior permission.
|
20
|
+
|
21
|
+
THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
|
22
|
+
WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
|
23
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
|
24
|
+
CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
|
25
|
+
OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
|
26
|
+
OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
27
|
+
OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
28
|
+
OR PERFORMANCE OF THIS SOFTWARE.
|
29
|
+
|
data/README.md
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
rbcluster
|
2
|
+
=========
|
3
|
+
|
4
|
+
Ruby bindings to the Cluster C library.
|
5
|
+
|
6
|
+
TODO
|
7
|
+
----
|
8
|
+
|
9
|
+
Functions:
|
10
|
+
|
11
|
+
* Cluster.clustercentroids
|
12
|
+
* Cluster.clustermedoids
|
13
|
+
* Cluster::Tree#{cut,slice,[],fetch}
|
14
|
+
|
15
|
+
Other:
|
16
|
+
|
17
|
+
* an examples/ folder
|
18
|
+
* make :transpose work
|
19
|
+
* specs for bad inputs
|
20
|
+
|
21
|
+
DONE
|
22
|
+
----
|
23
|
+
|
24
|
+
* Cluster.pca
|
25
|
+
* Cluster.somcluster
|
26
|
+
* Cluster.treecluster
|
27
|
+
* Cluster.clusterdistance
|
28
|
+
* Cluster.kcluster
|
29
|
+
* Cluster.kmedoids
|
30
|
+
* Cluster.distancematrix
|
31
|
+
* Cluster.median
|
32
|
+
* Cluster.mean
|
33
|
+
|
34
|
+
See also
|
35
|
+
--------
|
36
|
+
|
37
|
+
* http://bonsai.hgc.jp/~mdehoon/software/cluster/software.htm
|
38
|
+
* http://bonsai.hgc.jp/~mdehoon/software/cluster/cluster.pdf
|
39
|
+
|
40
|
+
Note on Patches/Pull Requests
|
41
|
+
-----------------------------
|
42
|
+
|
43
|
+
* Fork the project.
|
44
|
+
* Make your feature addition or bug fix.
|
45
|
+
* Add tests for it. This is important so I don't break it in a
|
46
|
+
future version unintentionally.
|
47
|
+
* Commit, do not mess with rakefile, version, or history.
|
48
|
+
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
49
|
+
* Send me a pull request. Bonus points for topic branches.
|
50
|
+
|
51
|
+
Copyright
|
52
|
+
---------
|
53
|
+
|
54
|
+
Copyright (c) 2011-2012 Jari Bakken. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
include Rake::DSL if defined?(Rake::DSL)
|
2
|
+
|
3
|
+
require 'bundler'
|
4
|
+
Bundler::GemHelper.install_tasks
|
5
|
+
|
6
|
+
require 'rake/extensiontask'
|
7
|
+
Rake::ExtensionTask.new do |ext|
|
8
|
+
ext.name = "rbcluster"
|
9
|
+
ext.lib_dir = "lib/rbcluster"
|
10
|
+
ext.ext_dir = "ext/rbcluster"
|
11
|
+
ext.gem_spec = eval(File.read("./rbcluster.gemspec"))
|
12
|
+
end
|
13
|
+
|
14
|
+
require 'rspec/core/rake_task'
|
15
|
+
RSpec::Core::RakeTask.new
|
16
|
+
|
17
|
+
task :default => [:compile, :spec]
|
@@ -0,0 +1,4598 @@
|
|
1
|
+
/* The C clustering library.
|
2
|
+
* Copyright (C) 2002 Michiel Jan Laurens de Hoon.
|
3
|
+
*
|
4
|
+
* This library was written at the Laboratory of DNA Information Analysis,
|
5
|
+
* Human Genome Center, Institute of Medical Science, University of Tokyo,
|
6
|
+
* 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
|
7
|
+
* Contact: mdehoon 'AT' gsc.riken.jp
|
8
|
+
*
|
9
|
+
* Permission to use, copy, modify, and distribute this software and its
|
10
|
+
* documentation with or without modifications and for any purpose and
|
11
|
+
* without fee is hereby granted, provided that any copyright notices
|
12
|
+
* appear in all copies and that both those copyright notices and this
|
13
|
+
* permission notice appear in supporting documentation, and that the
|
14
|
+
* names of the contributors or copyright holders not be used in
|
15
|
+
* advertising or publicity pertaining to distribution of the software
|
16
|
+
* without specific prior permission.
|
17
|
+
*
|
18
|
+
* THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
|
19
|
+
* WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
|
20
|
+
* WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
|
21
|
+
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
|
22
|
+
* OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
|
23
|
+
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
24
|
+
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
25
|
+
* OR PERFORMANCE OF THIS SOFTWARE.
|
26
|
+
*
|
27
|
+
*/
|
28
|
+
|
29
|
+
#include <time.h>
|
30
|
+
#include <stdlib.h>
|
31
|
+
#include <math.h>
|
32
|
+
#include <float.h>
|
33
|
+
#include <limits.h>
|
34
|
+
#include <string.h>
|
35
|
+
#include "cluster.h"
|
36
|
+
#ifdef WINDOWS
|
37
|
+
# include <windows.h>
|
38
|
+
#endif
|
39
|
+
|
40
|
+
/* ************************************************************************ */
|
41
|
+
|
42
|
+
#ifdef WINDOWS
|
43
|
+
/* Then we make a Windows DLL */
|
44
|
+
int WINAPI
|
45
|
+
clusterdll_init (HANDLE h, DWORD reason, void* foo)
|
46
|
+
{
|
47
|
+
return 1;
|
48
|
+
}
|
49
|
+
#endif
|
50
|
+
|
51
|
+
/* ************************************************************************ */
|
52
|
+
|
53
|
+
double mean(int n, double x[])
|
54
|
+
{ double result = 0.;
|
55
|
+
int i;
|
56
|
+
for (i = 0; i < n; i++) result += x[i];
|
57
|
+
result /= n;
|
58
|
+
return result;
|
59
|
+
}
|
60
|
+
|
61
|
+
/* ************************************************************************ */
|
62
|
+
|
63
|
+
double median (int n, double x[])
|
64
|
+
/*
|
65
|
+
Find the median of X(1), ... , X(N), using as much of the quicksort
|
66
|
+
algorithm as is needed to isolate it.
|
67
|
+
N.B. On exit, the array X is partially ordered.
|
68
|
+
Based on Alan J. Miller's median.f90 routine.
|
69
|
+
*/
|
70
|
+
|
71
|
+
{ int i, j;
|
72
|
+
int nr = n / 2;
|
73
|
+
int nl = nr - 1;
|
74
|
+
int even = 0;
|
75
|
+
/* hi & lo are position limits encompassing the median. */
|
76
|
+
int lo = 0;
|
77
|
+
int hi = n-1;
|
78
|
+
|
79
|
+
if (n==2*nr) even = 1;
|
80
|
+
if (n<3)
|
81
|
+
{ if (n<1) return 0.;
|
82
|
+
if (n == 1) return x[0];
|
83
|
+
return 0.5*(x[0]+x[1]);
|
84
|
+
}
|
85
|
+
|
86
|
+
/* Find median of 1st, middle & last values. */
|
87
|
+
do
|
88
|
+
{ int loop;
|
89
|
+
int mid = (lo + hi)/2;
|
90
|
+
double result = x[mid];
|
91
|
+
double xlo = x[lo];
|
92
|
+
double xhi = x[hi];
|
93
|
+
if (xhi<xlo)
|
94
|
+
{ double temp = xlo;
|
95
|
+
xlo = xhi;
|
96
|
+
xhi = temp;
|
97
|
+
}
|
98
|
+
if (result>xhi) result = xhi;
|
99
|
+
else if (result<xlo) result = xlo;
|
100
|
+
/* The basic quicksort algorithm to move all values <= the sort key (XMED)
|
101
|
+
* to the left-hand end, and all higher values to the other end.
|
102
|
+
*/
|
103
|
+
i = lo;
|
104
|
+
j = hi;
|
105
|
+
do
|
106
|
+
{ while (x[i]<result) i++;
|
107
|
+
while (x[j]>result) j--;
|
108
|
+
loop = 0;
|
109
|
+
if (i<j)
|
110
|
+
{ double temp = x[i];
|
111
|
+
x[i] = x[j];
|
112
|
+
x[j] = temp;
|
113
|
+
i++;
|
114
|
+
j--;
|
115
|
+
if (i<=j) loop = 1;
|
116
|
+
}
|
117
|
+
} while (loop); /* Decide which half the median is in. */
|
118
|
+
|
119
|
+
if (even)
|
120
|
+
{ if (j==nl && i==nr)
|
121
|
+
/* Special case, n even, j = n/2 & i = j + 1, so the median is
|
122
|
+
* between the two halves of the series. Find max. of the first
|
123
|
+
* half & min. of the second half, then average.
|
124
|
+
*/
|
125
|
+
{ int k;
|
126
|
+
double xmax = x[0];
|
127
|
+
double xmin = x[n-1];
|
128
|
+
for (k = lo; k <= j; k++) xmax = max(xmax,x[k]);
|
129
|
+
for (k = i; k <= hi; k++) xmin = min(xmin,x[k]);
|
130
|
+
return 0.5*(xmin + xmax);
|
131
|
+
}
|
132
|
+
if (j<nl) lo = i;
|
133
|
+
if (i>nr) hi = j;
|
134
|
+
if (i==j)
|
135
|
+
{ if (i==nl) lo = nl;
|
136
|
+
if (j==nr) hi = nr;
|
137
|
+
}
|
138
|
+
}
|
139
|
+
else
|
140
|
+
{ if (j<nr) lo = i;
|
141
|
+
if (i>nr) hi = j;
|
142
|
+
/* Test whether median has been isolated. */
|
143
|
+
if (i==j && i==nr) return result;
|
144
|
+
}
|
145
|
+
}
|
146
|
+
while (lo<hi-1);
|
147
|
+
|
148
|
+
if (even) return (0.5*(x[nl]+x[nr]));
|
149
|
+
if (x[lo]>x[hi])
|
150
|
+
{ double temp = x[lo];
|
151
|
+
x[lo] = x[hi];
|
152
|
+
x[hi] = temp;
|
153
|
+
}
|
154
|
+
return x[nr];
|
155
|
+
}
|
156
|
+
|
157
|
+
/* ********************************************************************** */
|
158
|
+
|
159
|
+
static const double* sortdata = NULL; /* used in the quicksort algorithm */
|
160
|
+
|
161
|
+
/* ---------------------------------------------------------------------- */
|
162
|
+
|
163
|
+
static
|
164
|
+
int compare(const void* a, const void* b)
|
165
|
+
/* Helper function for sort. Previously, this was a nested function under
|
166
|
+
* sort, which is not allowed under ANSI C.
|
167
|
+
*/
|
168
|
+
{ const int i1 = *(const int*)a;
|
169
|
+
const int i2 = *(const int*)b;
|
170
|
+
const double term1 = sortdata[i1];
|
171
|
+
const double term2 = sortdata[i2];
|
172
|
+
if (term1 < term2) return -1;
|
173
|
+
if (term1 > term2) return +1;
|
174
|
+
return 0;
|
175
|
+
}
|
176
|
+
|
177
|
+
/* ---------------------------------------------------------------------- */
|
178
|
+
|
179
|
+
void sort(int n, const double data[], int index[])
|
180
|
+
/* Sets up an index table given the data, such that data[index[]] is in
|
181
|
+
* increasing order. Sorting is done on the indices; the array data
|
182
|
+
* is unchanged.
|
183
|
+
*/
|
184
|
+
{ int i;
|
185
|
+
sortdata = data;
|
186
|
+
for (i = 0; i < n; i++) index[i] = i;
|
187
|
+
qsort(index, n, sizeof(int), compare);
|
188
|
+
}
|
189
|
+
|
190
|
+
/* ********************************************************************** */
|
191
|
+
|
192
|
+
static double* getrank (int n, double data[])
|
193
|
+
/* Calculates the ranks of the elements in the array data. Two elements with
|
194
|
+
* the same value get the same rank, equal to the average of the ranks had the
|
195
|
+
* elements different values. The ranks are returned as a newly allocated
|
196
|
+
* array that should be freed by the calling routine. If getrank fails due to
|
197
|
+
* a memory allocation error, it returns NULL.
|
198
|
+
*/
|
199
|
+
{ int i;
|
200
|
+
double* rank;
|
201
|
+
int* index;
|
202
|
+
rank = malloc(n*sizeof(double));
|
203
|
+
if (!rank) return NULL;
|
204
|
+
index = malloc(n*sizeof(int));
|
205
|
+
if (!index)
|
206
|
+
{ free(rank);
|
207
|
+
return NULL;
|
208
|
+
}
|
209
|
+
/* Call sort to get an index table */
|
210
|
+
sort (n, data, index);
|
211
|
+
/* Build a rank table */
|
212
|
+
for (i = 0; i < n; i++) rank[index[i]] = i;
|
213
|
+
/* Fix for equal ranks */
|
214
|
+
i = 0;
|
215
|
+
while (i < n)
|
216
|
+
{ int m;
|
217
|
+
double value = data[index[i]];
|
218
|
+
int j = i + 1;
|
219
|
+
while (j < n && data[index[j]] == value) j++;
|
220
|
+
m = j - i; /* number of equal ranks found */
|
221
|
+
value = rank[index[i]] + (m-1)/2.;
|
222
|
+
for (j = i; j < i + m; j++) rank[index[j]] = value;
|
223
|
+
i += m;
|
224
|
+
}
|
225
|
+
free (index);
|
226
|
+
return rank;
|
227
|
+
}
|
228
|
+
|
229
|
+
/* ---------------------------------------------------------------------- */
|
230
|
+
|
231
|
+
static int
|
232
|
+
makedatamask(int nrows, int ncols, double*** pdata, int*** pmask)
|
233
|
+
{ int i;
|
234
|
+
double** data;
|
235
|
+
int** mask;
|
236
|
+
data = malloc(nrows*sizeof(double*));
|
237
|
+
if(!data) return 0;
|
238
|
+
mask = malloc(nrows*sizeof(int*));
|
239
|
+
if(!mask)
|
240
|
+
{ free(data);
|
241
|
+
return 0;
|
242
|
+
}
|
243
|
+
for (i = 0; i < nrows; i++)
|
244
|
+
{ data[i] = malloc(ncols*sizeof(double));
|
245
|
+
if(!data[i]) break;
|
246
|
+
mask[i] = malloc(ncols*sizeof(int));
|
247
|
+
if(!mask[i])
|
248
|
+
{ free(data[i]);
|
249
|
+
break;
|
250
|
+
}
|
251
|
+
}
|
252
|
+
if (i==nrows) /* break not encountered */
|
253
|
+
{ *pdata = data;
|
254
|
+
*pmask = mask;
|
255
|
+
return 1;
|
256
|
+
}
|
257
|
+
*pdata = NULL;
|
258
|
+
*pmask = NULL;
|
259
|
+
nrows = i;
|
260
|
+
for (i = 0; i < nrows; i++)
|
261
|
+
{ free(data[i]);
|
262
|
+
free(mask[i]);
|
263
|
+
}
|
264
|
+
free(data);
|
265
|
+
free(mask);
|
266
|
+
return 0;
|
267
|
+
}
|
268
|
+
|
269
|
+
/* ---------------------------------------------------------------------- */
|
270
|
+
|
271
|
+
static void
|
272
|
+
freedatamask(int n, double** data, int** mask)
|
273
|
+
{ int i;
|
274
|
+
for (i = 0; i < n; i++)
|
275
|
+
{ free(mask[i]);
|
276
|
+
free(data[i]);
|
277
|
+
}
|
278
|
+
free(mask);
|
279
|
+
free(data);
|
280
|
+
}
|
281
|
+
|
282
|
+
/* ---------------------------------------------------------------------- */
|
283
|
+
|
284
|
+
static
|
285
|
+
double find_closest_pair(int n, double** distmatrix, int* ip, int* jp)
|
286
|
+
/*
|
287
|
+
This function searches the distance matrix to find the pair with the shortest
|
288
|
+
distance between them. The indices of the pair are returned in ip and jp; the
|
289
|
+
distance itself is returned by the function.
|
290
|
+
|
291
|
+
n (input) int
|
292
|
+
The number of elements in the distance matrix.
|
293
|
+
|
294
|
+
distmatrix (input) double**
|
295
|
+
A ragged array containing the distance matrix. The number of columns in each
|
296
|
+
row is one less than the row index.
|
297
|
+
|
298
|
+
ip (output) int*
|
299
|
+
A pointer to the integer that is to receive the first index of the pair with
|
300
|
+
the shortest distance.
|
301
|
+
|
302
|
+
jp (output) int*
|
303
|
+
A pointer to the integer that is to receive the second index of the pair with
|
304
|
+
the shortest distance.
|
305
|
+
*/
|
306
|
+
{ int i, j;
|
307
|
+
double temp;
|
308
|
+
double distance = distmatrix[1][0];
|
309
|
+
*ip = 1;
|
310
|
+
*jp = 0;
|
311
|
+
for (i = 1; i < n; i++)
|
312
|
+
{ for (j = 0; j < i; j++)
|
313
|
+
{ temp = distmatrix[i][j];
|
314
|
+
if (temp<distance)
|
315
|
+
{ distance = temp;
|
316
|
+
*ip = i;
|
317
|
+
*jp = j;
|
318
|
+
}
|
319
|
+
}
|
320
|
+
}
|
321
|
+
return distance;
|
322
|
+
}
|
323
|
+
|
324
|
+
/* ********************************************************************* */
|
325
|
+
|
326
|
+
static int svd(int m, int n, double** u, double w[], double** vt)
|
327
|
+
/*
|
328
|
+
* This subroutine is a translation of the Algol procedure svd,
|
329
|
+
* Num. Math. 14, 403-420(1970) by Golub and Reinsch.
|
330
|
+
* Handbook for Auto. Comp., Vol II-Linear Algebra, 134-151(1971).
|
331
|
+
*
|
332
|
+
* This subroutine determines the singular value decomposition
|
333
|
+
* t
|
334
|
+
* A=usv of a real m by n rectangular matrix, where m is greater
|
335
|
+
* than or equal to n. Householder bidiagonalization and a variant
|
336
|
+
* of the QR algorithm are used.
|
337
|
+
*
|
338
|
+
*
|
339
|
+
* On input.
|
340
|
+
*
|
341
|
+
* m is the number of rows of A (and u).
|
342
|
+
*
|
343
|
+
* n is the number of columns of A (and u) and the order of v.
|
344
|
+
*
|
345
|
+
* u contains the rectangular input matrix A to be decomposed.
|
346
|
+
*
|
347
|
+
* On output.
|
348
|
+
*
|
349
|
+
* the routine returns an integer ierr equal to
|
350
|
+
* 0 to indicate a normal return,
|
351
|
+
* k if the k-th singular value has not been
|
352
|
+
* determined after 30 iterations,
|
353
|
+
* -1 if memory allocation fails.
|
354
|
+
*
|
355
|
+
*
|
356
|
+
* w contains the n (non-negative) singular values of a (the
|
357
|
+
* diagonal elements of s). they are unordered. if an
|
358
|
+
* error exit is made, the singular values should be correct
|
359
|
+
* for indices ierr+1,ierr+2,...,n.
|
360
|
+
*
|
361
|
+
*
|
362
|
+
* u contains the matrix u (orthogonal column vectors) of the
|
363
|
+
* decomposition.
|
364
|
+
* if an error exit is made, the columns of u corresponding
|
365
|
+
* to indices of correct singular values should be correct.
|
366
|
+
*
|
367
|
+
* t
|
368
|
+
* vt contains the matrix v (orthogonal) of the decomposition.
|
369
|
+
* if an error exit is made, the columns of v corresponding
|
370
|
+
* to indices of correct singular values should be correct.
|
371
|
+
*
|
372
|
+
*
|
373
|
+
* Questions and comments should be directed to B. S. Garbow,
|
374
|
+
* Applied Mathematics division, Argonne National Laboratory
|
375
|
+
*
|
376
|
+
* Modified to eliminate machep
|
377
|
+
*
|
378
|
+
* Translated to C by Michiel de Hoon, Human Genome Center,
|
379
|
+
* University of Tokyo, for inclusion in the C Clustering Library.
|
380
|
+
* This routine is less general than the original svd routine, as
|
381
|
+
* it focuses on the singular value decomposition as needed for
|
382
|
+
* clustering. In particular,
|
383
|
+
* - We calculate both u and v in all cases
|
384
|
+
* - We pass the input array A via u; this array is subsequently
|
385
|
+
* overwritten.
|
386
|
+
* - We allocate for the array rv1, used as a working space,
|
387
|
+
* internally in this routine, instead of passing it as an
|
388
|
+
* argument. If the allocation fails, svd returns -1.
|
389
|
+
* 2003.06.05
|
390
|
+
*/
|
391
|
+
{ int i, j, k, i1, k1, l1, its;
|
392
|
+
double c,f,h,s,x,y,z;
|
393
|
+
int l = 0;
|
394
|
+
int ierr = 0;
|
395
|
+
double g = 0.0;
|
396
|
+
double scale = 0.0;
|
397
|
+
double anorm = 0.0;
|
398
|
+
double* rv1 = malloc(n*sizeof(double));
|
399
|
+
if (!rv1) return -1;
|
400
|
+
if (m >= n)
|
401
|
+
{ /* Householder reduction to bidiagonal form */
|
402
|
+
for (i = 0; i < n; i++)
|
403
|
+
{ l = i + 1;
|
404
|
+
rv1[i] = scale * g;
|
405
|
+
g = 0.0;
|
406
|
+
s = 0.0;
|
407
|
+
scale = 0.0;
|
408
|
+
for (k = i; k < m; k++) scale += fabs(u[k][i]);
|
409
|
+
if (scale != 0.0)
|
410
|
+
{ for (k = i; k < m; k++)
|
411
|
+
{ u[k][i] /= scale;
|
412
|
+
s += u[k][i]*u[k][i];
|
413
|
+
}
|
414
|
+
f = u[i][i];
|
415
|
+
g = (f >= 0) ? -sqrt(s) : sqrt(s);
|
416
|
+
h = f * g - s;
|
417
|
+
u[i][i] = f - g;
|
418
|
+
if (i < n-1)
|
419
|
+
{ for (j = l; j < n; j++)
|
420
|
+
{ s = 0.0;
|
421
|
+
for (k = i; k < m; k++) s += u[k][i] * u[k][j];
|
422
|
+
f = s / h;
|
423
|
+
for (k = i; k < m; k++) u[k][j] += f * u[k][i];
|
424
|
+
}
|
425
|
+
}
|
426
|
+
for (k = i; k < m; k++) u[k][i] *= scale;
|
427
|
+
}
|
428
|
+
w[i] = scale * g;
|
429
|
+
g = 0.0;
|
430
|
+
s = 0.0;
|
431
|
+
scale = 0.0;
|
432
|
+
if (i<n-1)
|
433
|
+
{ for (k = l; k < n; k++) scale += fabs(u[i][k]);
|
434
|
+
if (scale != 0.0)
|
435
|
+
{ for (k = l; k < n; k++)
|
436
|
+
{ u[i][k] /= scale;
|
437
|
+
s += u[i][k] * u[i][k];
|
438
|
+
}
|
439
|
+
f = u[i][l];
|
440
|
+
g = (f >= 0) ? -sqrt(s) : sqrt(s);
|
441
|
+
h = f * g - s;
|
442
|
+
u[i][l] = f - g;
|
443
|
+
for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
|
444
|
+
for (j = l; j < m; j++)
|
445
|
+
{ s = 0.0;
|
446
|
+
for (k = l; k < n; k++) s += u[j][k] * u[i][k];
|
447
|
+
for (k = l; k < n; k++) u[j][k] += s * rv1[k];
|
448
|
+
}
|
449
|
+
for (k = l; k < n; k++) u[i][k] *= scale;
|
450
|
+
}
|
451
|
+
}
|
452
|
+
anorm = max(anorm,fabs(w[i])+fabs(rv1[i]));
|
453
|
+
}
|
454
|
+
/* accumulation of right-hand transformations */
|
455
|
+
for (i = n-1; i>=0; i--)
|
456
|
+
{ if (i < n-1)
|
457
|
+
{ if (g != 0.0)
|
458
|
+
{ for (j = l; j < n; j++) vt[i][j] = (u[i][j] / u[i][l]) / g;
|
459
|
+
/* double division avoids possible underflow */
|
460
|
+
for (j = l; j < n; j++)
|
461
|
+
{ s = 0.0;
|
462
|
+
for (k = l; k < n; k++) s += u[i][k] * vt[j][k];
|
463
|
+
for (k = l; k < n; k++) vt[j][k] += s * vt[i][k];
|
464
|
+
}
|
465
|
+
}
|
466
|
+
}
|
467
|
+
for (j = l; j < n; j++)
|
468
|
+
{ vt[j][i] = 0.0;
|
469
|
+
vt[i][j] = 0.0;
|
470
|
+
}
|
471
|
+
vt[i][i] = 1.0;
|
472
|
+
g = rv1[i];
|
473
|
+
l = i;
|
474
|
+
}
|
475
|
+
/* accumulation of left-hand transformations */
|
476
|
+
for (i = n-1; i >= 0; i--)
|
477
|
+
{ l = i + 1;
|
478
|
+
g = w[i];
|
479
|
+
if (i!=n-1)
|
480
|
+
for (j = l; j < n; j++) u[i][j] = 0.0;
|
481
|
+
if (g!=0.0)
|
482
|
+
{ if (i!=n-1)
|
483
|
+
{ for (j = l; j < n; j++)
|
484
|
+
{ s = 0.0;
|
485
|
+
for (k = l; k < m; k++) s += u[k][i] * u[k][j];
|
486
|
+
/* double division avoids possible underflow */
|
487
|
+
f = (s / u[i][i]) / g;
|
488
|
+
for (k = i; k < m; k++) u[k][j] += f * u[k][i];
|
489
|
+
}
|
490
|
+
}
|
491
|
+
for (j = i; j < m; j++) u[j][i] /= g;
|
492
|
+
}
|
493
|
+
else
|
494
|
+
for (j = i; j < m; j++) u[j][i] = 0.0;
|
495
|
+
u[i][i] += 1.0;
|
496
|
+
}
|
497
|
+
/* diagonalization of the bidiagonal form */
|
498
|
+
for (k = n-1; k >= 0; k--)
|
499
|
+
{ k1 = k-1;
|
500
|
+
its = 0;
|
501
|
+
while(1)
|
502
|
+
/* test for splitting */
|
503
|
+
{ for (l = k; l >= 0; l--)
|
504
|
+
{ l1 = l-1;
|
505
|
+
if (fabs(rv1[l]) + anorm == anorm) break;
|
506
|
+
/* rv1[0] is always zero, so there is no exit
|
507
|
+
* through the bottom of the loop */
|
508
|
+
if (fabs(w[l1]) + anorm == anorm)
|
509
|
+
/* cancellation of rv1[l] if l greater than 0 */
|
510
|
+
{ c = 0.0;
|
511
|
+
s = 1.0;
|
512
|
+
for (i = l; i <= k; i++)
|
513
|
+
{ f = s * rv1[i];
|
514
|
+
rv1[i] *= c;
|
515
|
+
if (fabs(f) + anorm == anorm) break;
|
516
|
+
g = w[i];
|
517
|
+
h = sqrt(f*f+g*g);
|
518
|
+
w[i] = h;
|
519
|
+
c = g / h;
|
520
|
+
s = -f / h;
|
521
|
+
for (j = 0; j < m; j++)
|
522
|
+
{ y = u[j][l1];
|
523
|
+
z = u[j][i];
|
524
|
+
u[j][l1] = y * c + z * s;
|
525
|
+
u[j][i] = -y * s + z * c;
|
526
|
+
}
|
527
|
+
}
|
528
|
+
break;
|
529
|
+
}
|
530
|
+
}
|
531
|
+
/* test for convergence */
|
532
|
+
z = w[k];
|
533
|
+
if (l==k) /* convergence */
|
534
|
+
{ if (z < 0.0)
|
535
|
+
/* w[k] is made non-negative */
|
536
|
+
{ w[k] = -z;
|
537
|
+
for (j = 0; j < n; j++) vt[k][j] = -vt[k][j];
|
538
|
+
}
|
539
|
+
break;
|
540
|
+
}
|
541
|
+
else if (its==30)
|
542
|
+
{ ierr = k;
|
543
|
+
break;
|
544
|
+
}
|
545
|
+
else
|
546
|
+
/* shift from bottom 2 by 2 minor */
|
547
|
+
{ its++;
|
548
|
+
x = w[l];
|
549
|
+
y = w[k1];
|
550
|
+
g = rv1[k1];
|
551
|
+
h = rv1[k];
|
552
|
+
f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
|
553
|
+
g = sqrt(f*f+1.0);
|
554
|
+
f = ((x - z) * (x + z) + h * (y / (f + (f >= 0 ? g : -g)) - h)) / x;
|
555
|
+
/* next qr transformation */
|
556
|
+
c = 1.0;
|
557
|
+
s = 1.0;
|
558
|
+
for (i1 = l; i1 <= k1; i1++)
|
559
|
+
{ i = i1 + 1;
|
560
|
+
g = rv1[i];
|
561
|
+
y = w[i];
|
562
|
+
h = s * g;
|
563
|
+
g = c * g;
|
564
|
+
z = sqrt(f*f+h*h);
|
565
|
+
rv1[i1] = z;
|
566
|
+
c = f / z;
|
567
|
+
s = h / z;
|
568
|
+
f = x * c + g * s;
|
569
|
+
g = -x * s + g * c;
|
570
|
+
h = y * s;
|
571
|
+
y = y * c;
|
572
|
+
for (j = 0; j < n; j++)
|
573
|
+
{ x = vt[i1][j];
|
574
|
+
z = vt[i][j];
|
575
|
+
vt[i1][j] = x * c + z * s;
|
576
|
+
vt[i][j] = -x * s + z * c;
|
577
|
+
}
|
578
|
+
z = sqrt(f*f+h*h);
|
579
|
+
w[i1] = z;
|
580
|
+
/* rotation can be arbitrary if z is zero */
|
581
|
+
if (z!=0.0)
|
582
|
+
{ c = f / z;
|
583
|
+
s = h / z;
|
584
|
+
}
|
585
|
+
f = c * g + s * y;
|
586
|
+
x = -s * g + c * y;
|
587
|
+
for (j = 0; j < m; j++)
|
588
|
+
{ y = u[j][i1];
|
589
|
+
z = u[j][i];
|
590
|
+
u[j][i1] = y * c + z * s;
|
591
|
+
u[j][i] = -y * s + z * c;
|
592
|
+
}
|
593
|
+
}
|
594
|
+
rv1[l] = 0.0;
|
595
|
+
rv1[k] = f;
|
596
|
+
w[k] = x;
|
597
|
+
}
|
598
|
+
}
|
599
|
+
}
|
600
|
+
}
|
601
|
+
else /* m < n */
|
602
|
+
{ /* Householder reduction to bidiagonal form */
|
603
|
+
for (i = 0; i < m; i++)
|
604
|
+
{ l = i + 1;
|
605
|
+
rv1[i] = scale * g;
|
606
|
+
g = 0.0;
|
607
|
+
s = 0.0;
|
608
|
+
scale = 0.0;
|
609
|
+
for (k = i; k < n; k++) scale += fabs(u[i][k]);
|
610
|
+
if (scale != 0.0)
|
611
|
+
{ for (k = i; k < n; k++)
|
612
|
+
{ u[i][k] /= scale;
|
613
|
+
s += u[i][k]*u[i][k];
|
614
|
+
}
|
615
|
+
f = u[i][i];
|
616
|
+
g = (f >= 0) ? -sqrt(s) : sqrt(s);
|
617
|
+
h = f * g - s;
|
618
|
+
u[i][i] = f - g;
|
619
|
+
if (i < m-1)
|
620
|
+
{ for (j = l; j < m; j++)
|
621
|
+
{ s = 0.0;
|
622
|
+
for (k = i; k < n; k++) s += u[i][k] * u[j][k];
|
623
|
+
f = s / h;
|
624
|
+
for (k = i; k < n; k++) u[j][k] += f * u[i][k];
|
625
|
+
}
|
626
|
+
}
|
627
|
+
for (k = i; k < n; k++) u[i][k] *= scale;
|
628
|
+
}
|
629
|
+
w[i] = scale * g;
|
630
|
+
g = 0.0;
|
631
|
+
s = 0.0;
|
632
|
+
scale = 0.0;
|
633
|
+
if (i<m-1)
|
634
|
+
{ for (k = l; k < m; k++) scale += fabs(u[k][i]);
|
635
|
+
if (scale != 0.0)
|
636
|
+
{ for (k = l; k < m; k++)
|
637
|
+
{ u[k][i] /= scale;
|
638
|
+
s += u[k][i] * u[k][i];
|
639
|
+
}
|
640
|
+
f = u[l][i];
|
641
|
+
g = (f >= 0) ? -sqrt(s) : sqrt(s);
|
642
|
+
h = f * g - s;
|
643
|
+
u[l][i] = f - g;
|
644
|
+
for (k = l; k < m; k++) rv1[k] = u[k][i] / h;
|
645
|
+
for (j = l; j < n; j++)
|
646
|
+
{ s = 0.0;
|
647
|
+
for (k = l; k < m; k++) s += u[k][j] * u[k][i];
|
648
|
+
for (k = l; k < m; k++) u[k][j] += s * rv1[k];
|
649
|
+
}
|
650
|
+
for (k = l; k < m; k++) u[k][i] *= scale;
|
651
|
+
}
|
652
|
+
}
|
653
|
+
anorm = max(anorm,fabs(w[i])+fabs(rv1[i]));
|
654
|
+
}
|
655
|
+
/* accumulation of right-hand transformations */
|
656
|
+
for (i = m-1; i>=0; i--)
|
657
|
+
{ if (i < m-1)
|
658
|
+
{ if (g != 0.0)
|
659
|
+
{ for (j = l; j < m; j++) vt[j][i] = (u[j][i] / u[l][i]) / g;
|
660
|
+
/* double division avoids possible underflow */
|
661
|
+
for (j = l; j < m; j++)
|
662
|
+
{ s = 0.0;
|
663
|
+
for (k = l; k < m; k++) s += u[k][i] * vt[k][j];
|
664
|
+
for (k = l; k < m; k++) vt[k][j] += s * vt[k][i];
|
665
|
+
}
|
666
|
+
}
|
667
|
+
}
|
668
|
+
for (j = l; j < m; j++)
|
669
|
+
{ vt[i][j] = 0.0;
|
670
|
+
vt[j][i] = 0.0;
|
671
|
+
}
|
672
|
+
vt[i][i] = 1.0;
|
673
|
+
g = rv1[i];
|
674
|
+
l = i;
|
675
|
+
}
|
676
|
+
/* accumulation of left-hand transformations */
|
677
|
+
for (i = m-1; i >= 0; i--)
|
678
|
+
{ l = i + 1;
|
679
|
+
g = w[i];
|
680
|
+
if (i!=m-1)
|
681
|
+
for (j = l; j < m; j++) u[j][i] = 0.0;
|
682
|
+
if (g!=0.0)
|
683
|
+
{ if (i!=m-1)
|
684
|
+
{ for (j = l; j < m; j++)
|
685
|
+
{ s = 0.0;
|
686
|
+
for (k = l; k < n; k++) s += u[i][k] * u[j][k];
|
687
|
+
/* double division avoids possible underflow */
|
688
|
+
f = (s / u[i][i]) / g;
|
689
|
+
for (k = i; k < n; k++) u[j][k] += f * u[i][k];
|
690
|
+
}
|
691
|
+
}
|
692
|
+
for (j = i; j < n; j++) u[i][j] /= g;
|
693
|
+
}
|
694
|
+
else
|
695
|
+
for (j = i; j < n; j++) u[i][j] = 0.0;
|
696
|
+
u[i][i] += 1.0;
|
697
|
+
}
|
698
|
+
/* diagonalization of the bidiagonal form */
|
699
|
+
for (k = m-1; k >= 0; k--)
|
700
|
+
{ k1 = k-1;
|
701
|
+
its = 0;
|
702
|
+
while(1)
|
703
|
+
/* test for splitting */
|
704
|
+
{ for (l = k; l >= 0; l--)
|
705
|
+
{ l1 = l-1;
|
706
|
+
if (fabs(rv1[l]) + anorm == anorm) break;
|
707
|
+
/* rv1[0] is always zero, so there is no exit
|
708
|
+
* through the bottom of the loop */
|
709
|
+
if (fabs(w[l1]) + anorm == anorm)
|
710
|
+
/* cancellation of rv1[l] if l greater than 0 */
|
711
|
+
{ c = 0.0;
|
712
|
+
s = 1.0;
|
713
|
+
for (i = l; i <= k; i++)
|
714
|
+
{ f = s * rv1[i];
|
715
|
+
rv1[i] *= c;
|
716
|
+
if (fabs(f) + anorm == anorm) break;
|
717
|
+
g = w[i];
|
718
|
+
h = sqrt(f*f+g*g);
|
719
|
+
w[i] = h;
|
720
|
+
c = g / h;
|
721
|
+
s = -f / h;
|
722
|
+
for (j = 0; j < n; j++)
|
723
|
+
{ y = u[l1][j];
|
724
|
+
z = u[i][j];
|
725
|
+
u[l1][j] = y * c + z * s;
|
726
|
+
u[i][j] = -y * s + z * c;
|
727
|
+
}
|
728
|
+
}
|
729
|
+
break;
|
730
|
+
}
|
731
|
+
}
|
732
|
+
/* test for convergence */
|
733
|
+
z = w[k];
|
734
|
+
if (l==k) /* convergence */
|
735
|
+
{ if (z < 0.0)
|
736
|
+
/* w[k] is made non-negative */
|
737
|
+
{ w[k] = -z;
|
738
|
+
for (j = 0; j < m; j++) vt[j][k] = -vt[j][k];
|
739
|
+
}
|
740
|
+
break;
|
741
|
+
}
|
742
|
+
else if (its==30)
|
743
|
+
{ ierr = k;
|
744
|
+
break;
|
745
|
+
}
|
746
|
+
else
|
747
|
+
/* shift from bottom 2 by 2 minor */
|
748
|
+
{ its++;
|
749
|
+
x = w[l];
|
750
|
+
y = w[k1];
|
751
|
+
g = rv1[k1];
|
752
|
+
h = rv1[k];
|
753
|
+
f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
|
754
|
+
g = sqrt(f*f+1.0);
|
755
|
+
f = ((x - z) * (x + z) + h * (y / (f + (f >= 0 ? g : -g)) - h)) / x;
|
756
|
+
/* next qr transformation */
|
757
|
+
c = 1.0;
|
758
|
+
s = 1.0;
|
759
|
+
for (i1 = l; i1 <= k1; i1++)
|
760
|
+
{ i = i1 + 1;
|
761
|
+
g = rv1[i];
|
762
|
+
y = w[i];
|
763
|
+
h = s * g;
|
764
|
+
g = c * g;
|
765
|
+
z = sqrt(f*f+h*h);
|
766
|
+
rv1[i1] = z;
|
767
|
+
c = f / z;
|
768
|
+
s = h / z;
|
769
|
+
f = x * c + g * s;
|
770
|
+
g = -x * s + g * c;
|
771
|
+
h = y * s;
|
772
|
+
y = y * c;
|
773
|
+
for (j = 0; j < m; j++)
|
774
|
+
{ x = vt[j][i1];
|
775
|
+
z = vt[j][i];
|
776
|
+
vt[j][i1] = x * c + z * s;
|
777
|
+
vt[j][i] = -x * s + z * c;
|
778
|
+
}
|
779
|
+
z = sqrt(f*f+h*h);
|
780
|
+
w[i1] = z;
|
781
|
+
/* rotation can be arbitrary if z is zero */
|
782
|
+
if (z!=0.0)
|
783
|
+
{ c = f / z;
|
784
|
+
s = h / z;
|
785
|
+
}
|
786
|
+
f = c * g + s * y;
|
787
|
+
x = -s * g + c * y;
|
788
|
+
for (j = 0; j < n; j++)
|
789
|
+
{ y = u[i1][j];
|
790
|
+
z = u[i][j];
|
791
|
+
u[i1][j] = y * c + z * s;
|
792
|
+
u[i][j] = -y * s + z * c;
|
793
|
+
}
|
794
|
+
}
|
795
|
+
rv1[l] = 0.0;
|
796
|
+
rv1[k] = f;
|
797
|
+
w[k] = x;
|
798
|
+
}
|
799
|
+
}
|
800
|
+
}
|
801
|
+
}
|
802
|
+
free(rv1);
|
803
|
+
return ierr;
|
804
|
+
}
|
805
|
+
|
806
|
+
/* ********************************************************************* */
|
807
|
+
|
808
|
+
int pca(int nrows, int ncolumns, double** u, double** v, double* w)
|
809
|
+
/*
|
810
|
+
Purpose
|
811
|
+
=======
|
812
|
+
|
813
|
+
This subroutine uses the singular value decomposition to perform principal
|
814
|
+
components analysis of a real nrows by ncolumns rectangular matrix.
|
815
|
+
|
816
|
+
Arguments
|
817
|
+
=========
|
818
|
+
|
819
|
+
nrows (input) int
|
820
|
+
The number of rows in the matrix u.
|
821
|
+
|
822
|
+
ncolumns (input) int
|
823
|
+
The number of columns in the matrix v.
|
824
|
+
|
825
|
+
u (input) double[nrows][ncolumns]
|
826
|
+
On input, the array containing the data to which the principal component
|
827
|
+
analysis should be applied. The function assumes that the mean has already been
|
828
|
+
subtracted of each column, and hence that the mean of each column is zero.
|
829
|
+
On output, see below.
|
830
|
+
|
831
|
+
v (input) double[n][n], where n = min(nrows, ncolumns)
|
832
|
+
Not used on input.
|
833
|
+
|
834
|
+
w (input) double[n], where n = min(nrows, ncolumns)
|
835
|
+
Not used on input.
|
836
|
+
|
837
|
+
|
838
|
+
Return value
|
839
|
+
============
|
840
|
+
|
841
|
+
On output:
|
842
|
+
|
843
|
+
If nrows >= ncolumns, then
|
844
|
+
|
845
|
+
u contains the coordinates with respect to the principal components;
|
846
|
+
v contains the principal component vectors.
|
847
|
+
|
848
|
+
The dot product u . v reproduces the data that were passed in u.
|
849
|
+
|
850
|
+
|
851
|
+
If nrows < ncolumns, then
|
852
|
+
|
853
|
+
u contains the principal component vectors;
|
854
|
+
v contains the coordinates with respect to the principal components.
|
855
|
+
|
856
|
+
The dot product v . u reproduces the data that were passed in u.
|
857
|
+
|
858
|
+
The eigenvalues of the covariance matrix are returned in w.
|
859
|
+
|
860
|
+
The arrays u, v, and w are sorted according to eigenvalue, with the largest
|
861
|
+
eigenvalues appearing first.
|
862
|
+
|
863
|
+
The function returns 0 if successful, -1 if memory allocation fails, and a
|
864
|
+
positive integer if the singular value decomposition fails to converge.
|
865
|
+
*/
|
866
|
+
{
|
867
|
+
int i;
|
868
|
+
int j;
|
869
|
+
int error;
|
870
|
+
int* index = malloc(ncolumns*sizeof(int));
|
871
|
+
double* temp = malloc(ncolumns*sizeof(double));
|
872
|
+
if (!index || !temp)
|
873
|
+
{ if (index) free(index);
|
874
|
+
if (temp) free(temp);
|
875
|
+
return -1;
|
876
|
+
}
|
877
|
+
error = svd(nrows, ncolumns, u, w, v);
|
878
|
+
if (error==0)
|
879
|
+
{
|
880
|
+
if (nrows >= ncolumns)
|
881
|
+
{ for (j = 0; j < ncolumns; j++)
|
882
|
+
{ const double s = w[j];
|
883
|
+
for (i = 0; i < nrows; i++) u[i][j] *= s;
|
884
|
+
}
|
885
|
+
sort(ncolumns, w, index);
|
886
|
+
for (i = 0; i < ncolumns/2; i++)
|
887
|
+
{ j = index[i];
|
888
|
+
index[i] = index[ncolumns-1-i];
|
889
|
+
index[ncolumns-1-i] = j;
|
890
|
+
}
|
891
|
+
for (i = 0; i < nrows; i++)
|
892
|
+
{ for (j = 0; j < ncolumns; j++) temp[j] = u[i][index[j]];
|
893
|
+
for (j = 0; j < ncolumns; j++) u[i][j] = temp[j];
|
894
|
+
}
|
895
|
+
for (i = 0; i < ncolumns; i++)
|
896
|
+
{ for (j = 0; j < ncolumns; j++) temp[j] = v[index[j]][i];
|
897
|
+
for (j = 0; j < ncolumns; j++) v[j][i] = temp[j];
|
898
|
+
}
|
899
|
+
for (i = 0; i < ncolumns; i++) temp[i] = w[index[i]];
|
900
|
+
for (i = 0; i < ncolumns; i++) w[i] = temp[i];
|
901
|
+
}
|
902
|
+
else /* nrows < ncolumns */
|
903
|
+
{ for (j = 0; j < nrows; j++)
|
904
|
+
{ const double s = w[j];
|
905
|
+
for (i = 0; i < nrows; i++) v[i][j] *= s;
|
906
|
+
}
|
907
|
+
sort(nrows, w, index);
|
908
|
+
for (i = 0; i < nrows/2; i++)
|
909
|
+
{ j = index[i];
|
910
|
+
index[i] = index[nrows-1-i];
|
911
|
+
index[nrows-1-i] = j;
|
912
|
+
}
|
913
|
+
for (j = 0; j < ncolumns; j++)
|
914
|
+
{ for (i = 0; i < nrows; i++) temp[i] = u[index[i]][j];
|
915
|
+
for (i = 0; i < nrows; i++) u[i][j] = temp[i];
|
916
|
+
}
|
917
|
+
for (j = 0; j < nrows; j++)
|
918
|
+
{ for (i = 0; i < nrows; i++) temp[i] = v[j][index[i]];
|
919
|
+
for (i = 0; i < nrows; i++) v[j][i] = temp[i];
|
920
|
+
}
|
921
|
+
for (i = 0; i < nrows; i++) temp[i] = w[index[i]];
|
922
|
+
for (i = 0; i < nrows; i++) w[i] = temp[i];
|
923
|
+
}
|
924
|
+
}
|
925
|
+
free(index);
|
926
|
+
free(temp);
|
927
|
+
return error;
|
928
|
+
}
|
929
|
+
|
930
|
+
/* ********************************************************************* */
|
931
|
+
|
932
|
+
static
|
933
|
+
double euclid (int n, double** data1, double** data2, int** mask1, int** mask2,
|
934
|
+
const double weight[], int index1, int index2, int transpose)
|
935
|
+
|
936
|
+
/*
|
937
|
+
Purpose
|
938
|
+
=======
|
939
|
+
|
940
|
+
The euclid routine calculates the weighted Euclidean distance between two
|
941
|
+
rows or columns in a matrix.
|
942
|
+
|
943
|
+
Arguments
|
944
|
+
=========
|
945
|
+
|
946
|
+
n (input) int
|
947
|
+
The number of elements in a row or column. If transpose==0, then n is the number
|
948
|
+
of columns; otherwise, n is the number of rows.
|
949
|
+
|
950
|
+
data1 (input) double array
|
951
|
+
The data array containing the first vector.
|
952
|
+
|
953
|
+
data2 (input) double array
|
954
|
+
The data array containing the second vector.
|
955
|
+
|
956
|
+
mask1 (input) int array
|
957
|
+
This array which elements in data1 are missing. If mask1[i][j]==0, then
|
958
|
+
data1[i][j] is missing.
|
959
|
+
|
960
|
+
mask2 (input) int array
|
961
|
+
This array which elements in data2 are missing. If mask2[i][j]==0, then
|
962
|
+
data2[i][j] is missing.
|
963
|
+
|
964
|
+
weight (input) double[n]
|
965
|
+
The weights that are used to calculate the distance.
|
966
|
+
|
967
|
+
index1 (input) int
|
968
|
+
Index of the first row or column.
|
969
|
+
|
970
|
+
index2 (input) int
|
971
|
+
Index of the second row or column.
|
972
|
+
|
973
|
+
transpose (input) int
|
974
|
+
If transpose==0, the distance between two rows in the matrix is calculated.
|
975
|
+
Otherwise, the distance between two columns in the matrix is calculated.
|
976
|
+
|
977
|
+
============================================================================
|
978
|
+
*/
|
979
|
+
{ double result = 0.;
|
980
|
+
double tweight = 0;
|
981
|
+
int i;
|
982
|
+
if (transpose==0) /* Calculate the distance between two rows */
|
983
|
+
{ for (i = 0; i < n; i++)
|
984
|
+
{ if (mask1[index1][i] && mask2[index2][i])
|
985
|
+
{ double term = data1[index1][i] - data2[index2][i];
|
986
|
+
result += weight[i]*term*term;
|
987
|
+
tweight += weight[i];
|
988
|
+
}
|
989
|
+
}
|
990
|
+
}
|
991
|
+
else
|
992
|
+
{ for (i = 0; i < n; i++)
|
993
|
+
{ if (mask1[i][index1] && mask2[i][index2])
|
994
|
+
{ double term = data1[i][index1] - data2[i][index2];
|
995
|
+
result += weight[i]*term*term;
|
996
|
+
tweight += weight[i];
|
997
|
+
}
|
998
|
+
}
|
999
|
+
}
|
1000
|
+
if (!tweight) return 0; /* usually due to empty clusters */
|
1001
|
+
result /= tweight;
|
1002
|
+
return result;
|
1003
|
+
}
|
1004
|
+
|
1005
|
+
/* ********************************************************************* */
|
1006
|
+
|
1007
|
+
static
|
1008
|
+
double cityblock (int n, double** data1, double** data2, int** mask1,
|
1009
|
+
int** mask2, const double weight[], int index1, int index2, int transpose)
|
1010
|
+
|
1011
|
+
/*
|
1012
|
+
Purpose
|
1013
|
+
=======
|
1014
|
+
|
1015
|
+
The cityblock routine calculates the weighted "City Block" distance between
|
1016
|
+
two rows or columns in a matrix. City Block distance is defined as the
|
1017
|
+
absolute value of X1-X2 plus the absolute value of Y1-Y2 plus..., which is
|
1018
|
+
equivalent to taking an "up and over" path.
|
1019
|
+
|
1020
|
+
Arguments
|
1021
|
+
=========
|
1022
|
+
|
1023
|
+
n (input) int
|
1024
|
+
The number of elements in a row or column. If transpose==0, then n is the number
|
1025
|
+
of columns; otherwise, n is the number of rows.
|
1026
|
+
|
1027
|
+
data1 (input) double array
|
1028
|
+
The data array containing the first vector.
|
1029
|
+
|
1030
|
+
data2 (input) double array
|
1031
|
+
The data array containing the second vector.
|
1032
|
+
|
1033
|
+
mask1 (input) int array
|
1034
|
+
This array which elements in data1 are missing. If mask1[i][j]==0, then
|
1035
|
+
data1[i][j] is missing.
|
1036
|
+
|
1037
|
+
mask2 (input) int array
|
1038
|
+
This array which elements in data2 are missing. If mask2[i][j]==0, then
|
1039
|
+
data2[i][j] is missing.
|
1040
|
+
|
1041
|
+
weight (input) double[n]
|
1042
|
+
The weights that are used to calculate the distance.
|
1043
|
+
|
1044
|
+
index1 (input) int
|
1045
|
+
Index of the first row or column.
|
1046
|
+
|
1047
|
+
index2 (input) int
|
1048
|
+
Index of the second row or column.
|
1049
|
+
|
1050
|
+
transpose (input) int
|
1051
|
+
If transpose==0, the distance between two rows in the matrix is calculated.
|
1052
|
+
Otherwise, the distance between two columns in the matrix is calculated.
|
1053
|
+
|
1054
|
+
============================================================================ */
|
1055
|
+
{ double result = 0.;
|
1056
|
+
double tweight = 0;
|
1057
|
+
int i;
|
1058
|
+
if (transpose==0) /* Calculate the distance between two rows */
|
1059
|
+
{ for (i = 0; i < n; i++)
|
1060
|
+
{ if (mask1[index1][i] && mask2[index2][i])
|
1061
|
+
{ double term = data1[index1][i] - data2[index2][i];
|
1062
|
+
result = result + weight[i]*fabs(term);
|
1063
|
+
tweight += weight[i];
|
1064
|
+
}
|
1065
|
+
}
|
1066
|
+
}
|
1067
|
+
else
|
1068
|
+
{ for (i = 0; i < n; i++)
|
1069
|
+
{ if (mask1[i][index1] && mask2[i][index2])
|
1070
|
+
{ double term = data1[i][index1] - data2[i][index2];
|
1071
|
+
result = result + weight[i]*fabs(term);
|
1072
|
+
tweight += weight[i];
|
1073
|
+
}
|
1074
|
+
}
|
1075
|
+
}
|
1076
|
+
if (!tweight) return 0; /* usually due to empty clusters */
|
1077
|
+
result /= tweight;
|
1078
|
+
return result;
|
1079
|
+
}
|
1080
|
+
|
1081
|
+
/* ********************************************************************* */
|
1082
|
+
|
1083
|
+
static
|
1084
|
+
double correlation (int n, double** data1, double** data2, int** mask1,
|
1085
|
+
int** mask2, const double weight[], int index1, int index2, int transpose)
|
1086
|
+
/*
|
1087
|
+
Purpose
|
1088
|
+
=======
|
1089
|
+
|
1090
|
+
The correlation routine calculates the weighted Pearson distance between two
|
1091
|
+
rows or columns in a matrix. We define the Pearson distance as one minus the
|
1092
|
+
Pearson correlation.
|
1093
|
+
This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
|
1094
|
+
but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
|
1095
|
+
(e.g., choose b = a + c).
|
1096
|
+
|
1097
|
+
Arguments
|
1098
|
+
=========
|
1099
|
+
|
1100
|
+
n (input) int
|
1101
|
+
The number of elements in a row or column. If transpose==0, then n is the number
|
1102
|
+
of columns; otherwise, n is the number of rows.
|
1103
|
+
|
1104
|
+
data1 (input) double array
|
1105
|
+
The data array containing the first vector.
|
1106
|
+
|
1107
|
+
data2 (input) double array
|
1108
|
+
The data array containing the second vector.
|
1109
|
+
|
1110
|
+
mask1 (input) int array
|
1111
|
+
This array which elements in data1 are missing. If mask1[i][j]==0, then
|
1112
|
+
data1[i][j] is missing.
|
1113
|
+
|
1114
|
+
mask2 (input) int array
|
1115
|
+
This array which elements in data2 are missing. If mask2[i][j]==0, then
|
1116
|
+
data2[i][j] is missing.
|
1117
|
+
|
1118
|
+
weight (input) double[n]
|
1119
|
+
The weights that are used to calculate the distance.
|
1120
|
+
|
1121
|
+
index1 (input) int
|
1122
|
+
Index of the first row or column.
|
1123
|
+
|
1124
|
+
index2 (input) int
|
1125
|
+
Index of the second row or column.
|
1126
|
+
|
1127
|
+
transpose (input) int
|
1128
|
+
If transpose==0, the distance between two rows in the matrix is calculated.
|
1129
|
+
Otherwise, the distance between two columns in the matrix is calculated.
|
1130
|
+
============================================================================
|
1131
|
+
*/
|
1132
|
+
{ double result = 0.;
|
1133
|
+
double sum1 = 0.;
|
1134
|
+
double sum2 = 0.;
|
1135
|
+
double denom1 = 0.;
|
1136
|
+
double denom2 = 0.;
|
1137
|
+
double tweight = 0.;
|
1138
|
+
if (transpose==0) /* Calculate the distance between two rows */
|
1139
|
+
{ int i;
|
1140
|
+
for (i = 0; i < n; i++)
|
1141
|
+
{ if (mask1[index1][i] && mask2[index2][i])
|
1142
|
+
{ double term1 = data1[index1][i];
|
1143
|
+
double term2 = data2[index2][i];
|
1144
|
+
double w = weight[i];
|
1145
|
+
sum1 += w*term1;
|
1146
|
+
sum2 += w*term2;
|
1147
|
+
result += w*term1*term2;
|
1148
|
+
denom1 += w*term1*term1;
|
1149
|
+
denom2 += w*term2*term2;
|
1150
|
+
tweight += w;
|
1151
|
+
}
|
1152
|
+
}
|
1153
|
+
}
|
1154
|
+
else
|
1155
|
+
{ int i;
|
1156
|
+
for (i = 0; i < n; i++)
|
1157
|
+
{ if (mask1[i][index1] && mask2[i][index2])
|
1158
|
+
{ double term1 = data1[i][index1];
|
1159
|
+
double term2 = data2[i][index2];
|
1160
|
+
double w = weight[i];
|
1161
|
+
sum1 += w*term1;
|
1162
|
+
sum2 += w*term2;
|
1163
|
+
result += w*term1*term2;
|
1164
|
+
denom1 += w*term1*term1;
|
1165
|
+
denom2 += w*term2*term2;
|
1166
|
+
tweight += w;
|
1167
|
+
}
|
1168
|
+
}
|
1169
|
+
}
|
1170
|
+
if (!tweight) return 0; /* usually due to empty clusters */
|
1171
|
+
result -= sum1 * sum2 / tweight;
|
1172
|
+
denom1 -= sum1 * sum1 / tweight;
|
1173
|
+
denom2 -= sum2 * sum2 / tweight;
|
1174
|
+
if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
|
1175
|
+
if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
|
1176
|
+
result = result / sqrt(denom1*denom2);
|
1177
|
+
result = 1. - result;
|
1178
|
+
return result;
|
1179
|
+
}
|
1180
|
+
|
1181
|
+
/* ********************************************************************* */
|
1182
|
+
|
1183
|
+
static
|
1184
|
+
double acorrelation (int n, double** data1, double** data2, int** mask1,
|
1185
|
+
int** mask2, const double weight[], int index1, int index2, int transpose)
|
1186
|
+
/*
|
1187
|
+
Purpose
|
1188
|
+
=======
|
1189
|
+
|
1190
|
+
The acorrelation routine calculates the weighted Pearson distance between two
|
1191
|
+
rows or columns, using the absolute value of the correlation.
|
1192
|
+
This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
|
1193
|
+
but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
|
1194
|
+
(e.g., choose b = a + c).
|
1195
|
+
|
1196
|
+
Arguments
|
1197
|
+
=========
|
1198
|
+
|
1199
|
+
n (input) int
|
1200
|
+
The number of elements in a row or column. If transpose==0, then n is the number
|
1201
|
+
of columns; otherwise, n is the number of rows.
|
1202
|
+
|
1203
|
+
data1 (input) double array
|
1204
|
+
The data array containing the first vector.
|
1205
|
+
|
1206
|
+
data2 (input) double array
|
1207
|
+
The data array containing the second vector.
|
1208
|
+
|
1209
|
+
mask1 (input) int array
|
1210
|
+
This array which elements in data1 are missing. If mask1[i][j]==0, then
|
1211
|
+
data1[i][j] is missing.
|
1212
|
+
|
1213
|
+
mask2 (input) int array
|
1214
|
+
This array which elements in data2 are missing. If mask2[i][j]==0, then
|
1215
|
+
data2[i][j] is missing.
|
1216
|
+
|
1217
|
+
weight (input) double[n]
|
1218
|
+
The weights that are used to calculate the distance.
|
1219
|
+
|
1220
|
+
index1 (input) int
|
1221
|
+
Index of the first row or column.
|
1222
|
+
|
1223
|
+
index2 (input) int
|
1224
|
+
Index of the second row or column.
|
1225
|
+
|
1226
|
+
transpose (input) int
|
1227
|
+
If transpose==0, the distance between two rows in the matrix is calculated.
|
1228
|
+
Otherwise, the distance between two columns in the matrix is calculated.
|
1229
|
+
============================================================================
|
1230
|
+
*/
|
1231
|
+
{ double result = 0.;
|
1232
|
+
double sum1 = 0.;
|
1233
|
+
double sum2 = 0.;
|
1234
|
+
double denom1 = 0.;
|
1235
|
+
double denom2 = 0.;
|
1236
|
+
double tweight = 0.;
|
1237
|
+
if (transpose==0) /* Calculate the distance between two rows */
|
1238
|
+
{ int i;
|
1239
|
+
for (i = 0; i < n; i++)
|
1240
|
+
{ if (mask1[index1][i] && mask2[index2][i])
|
1241
|
+
{ double term1 = data1[index1][i];
|
1242
|
+
double term2 = data2[index2][i];
|
1243
|
+
double w = weight[i];
|
1244
|
+
sum1 += w*term1;
|
1245
|
+
sum2 += w*term2;
|
1246
|
+
result += w*term1*term2;
|
1247
|
+
denom1 += w*term1*term1;
|
1248
|
+
denom2 += w*term2*term2;
|
1249
|
+
tweight += w;
|
1250
|
+
}
|
1251
|
+
}
|
1252
|
+
}
|
1253
|
+
else
|
1254
|
+
{ int i;
|
1255
|
+
for (i = 0; i < n; i++)
|
1256
|
+
{ if (mask1[i][index1] && mask2[i][index2])
|
1257
|
+
{ double term1 = data1[i][index1];
|
1258
|
+
double term2 = data2[i][index2];
|
1259
|
+
double w = weight[i];
|
1260
|
+
sum1 += w*term1;
|
1261
|
+
sum2 += w*term2;
|
1262
|
+
result += w*term1*term2;
|
1263
|
+
denom1 += w*term1*term1;
|
1264
|
+
denom2 += w*term2*term2;
|
1265
|
+
tweight += w;
|
1266
|
+
}
|
1267
|
+
}
|
1268
|
+
}
|
1269
|
+
if (!tweight) return 0; /* usually due to empty clusters */
|
1270
|
+
result -= sum1 * sum2 / tweight;
|
1271
|
+
denom1 -= sum1 * sum1 / tweight;
|
1272
|
+
denom2 -= sum2 * sum2 / tweight;
|
1273
|
+
if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
|
1274
|
+
if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
|
1275
|
+
result = fabs(result) / sqrt(denom1*denom2);
|
1276
|
+
result = 1. - result;
|
1277
|
+
return result;
|
1278
|
+
}
|
1279
|
+
|
1280
|
+
/* ********************************************************************* */
|
1281
|
+
|
1282
|
+
static
|
1283
|
+
double ucorrelation (int n, double** data1, double** data2, int** mask1,
|
1284
|
+
int** mask2, const double weight[], int index1, int index2, int transpose)
|
1285
|
+
/*
|
1286
|
+
Purpose
|
1287
|
+
=======
|
1288
|
+
|
1289
|
+
The ucorrelation routine calculates the weighted Pearson distance between two
|
1290
|
+
rows or columns, using the uncentered version of the Pearson correlation. In the
|
1291
|
+
uncentered Pearson correlation, a zero mean is used for both vectors even if
|
1292
|
+
the actual mean is nonzero.
|
1293
|
+
This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
|
1294
|
+
but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
|
1295
|
+
(e.g., choose b = a + c).
|
1296
|
+
|
1297
|
+
Arguments
|
1298
|
+
=========
|
1299
|
+
|
1300
|
+
n (input) int
|
1301
|
+
The number of elements in a row or column. If transpose==0, then n is the number
|
1302
|
+
of columns; otherwise, n is the number of rows.
|
1303
|
+
|
1304
|
+
data1 (input) double array
|
1305
|
+
The data array containing the first vector.
|
1306
|
+
|
1307
|
+
data2 (input) double array
|
1308
|
+
The data array containing the second vector.
|
1309
|
+
|
1310
|
+
mask1 (input) int array
|
1311
|
+
This array which elements in data1 are missing. If mask1[i][j]==0, then
|
1312
|
+
data1[i][j] is missing.
|
1313
|
+
|
1314
|
+
mask2 (input) int array
|
1315
|
+
This array which elements in data2 are missing. If mask2[i][j]==0, then
|
1316
|
+
data2[i][j] is missing.
|
1317
|
+
|
1318
|
+
weight (input) double[n]
|
1319
|
+
The weights that are used to calculate the distance.
|
1320
|
+
|
1321
|
+
index1 (input) int
|
1322
|
+
Index of the first row or column.
|
1323
|
+
|
1324
|
+
index2 (input) int
|
1325
|
+
Index of the second row or column.
|
1326
|
+
|
1327
|
+
transpose (input) int
|
1328
|
+
If transpose==0, the distance between two rows in the matrix is calculated.
|
1329
|
+
Otherwise, the distance between two columns in the matrix is calculated.
|
1330
|
+
============================================================================
|
1331
|
+
*/
|
1332
|
+
{ double result = 0.;
|
1333
|
+
double denom1 = 0.;
|
1334
|
+
double denom2 = 0.;
|
1335
|
+
int flag = 0;
|
1336
|
+
/* flag will remain zero if no nonzero combinations of mask1 and mask2 are
|
1337
|
+
* found.
|
1338
|
+
*/
|
1339
|
+
if (transpose==0) /* Calculate the distance between two rows */
|
1340
|
+
{ int i;
|
1341
|
+
for (i = 0; i < n; i++)
|
1342
|
+
{ if (mask1[index1][i] && mask2[index2][i])
|
1343
|
+
{ double term1 = data1[index1][i];
|
1344
|
+
double term2 = data2[index2][i];
|
1345
|
+
double w = weight[i];
|
1346
|
+
result += w*term1*term2;
|
1347
|
+
denom1 += w*term1*term1;
|
1348
|
+
denom2 += w*term2*term2;
|
1349
|
+
flag = 1;
|
1350
|
+
}
|
1351
|
+
}
|
1352
|
+
}
|
1353
|
+
else
|
1354
|
+
{ int i;
|
1355
|
+
for (i = 0; i < n; i++)
|
1356
|
+
{ if (mask1[i][index1] && mask2[i][index2])
|
1357
|
+
{ double term1 = data1[i][index1];
|
1358
|
+
double term2 = data2[i][index2];
|
1359
|
+
double w = weight[i];
|
1360
|
+
result += w*term1*term2;
|
1361
|
+
denom1 += w*term1*term1;
|
1362
|
+
denom2 += w*term2*term2;
|
1363
|
+
flag = 1;
|
1364
|
+
}
|
1365
|
+
}
|
1366
|
+
}
|
1367
|
+
if (!flag) return 0.;
|
1368
|
+
if (denom1==0.) return 1.;
|
1369
|
+
if (denom2==0.) return 1.;
|
1370
|
+
result = result / sqrt(denom1*denom2);
|
1371
|
+
result = 1. - result;
|
1372
|
+
return result;
|
1373
|
+
}
|
1374
|
+
|
1375
|
+
/* ********************************************************************* */
|
1376
|
+
|
1377
|
+
static
|
1378
|
+
double uacorrelation (int n, double** data1, double** data2, int** mask1,
|
1379
|
+
int** mask2, const double weight[], int index1, int index2, int transpose)
|
1380
|
+
/*
|
1381
|
+
Purpose
|
1382
|
+
=======
|
1383
|
+
|
1384
|
+
The uacorrelation routine calculates the weighted Pearson distance between two
|
1385
|
+
rows or columns, using the absolute value of the uncentered version of the
|
1386
|
+
Pearson correlation. In the uncentered Pearson correlation, a zero mean is used
|
1387
|
+
for both vectors even if the actual mean is nonzero.
|
1388
|
+
This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
|
1389
|
+
but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
|
1390
|
+
(e.g., choose b = a + c).
|
1391
|
+
|
1392
|
+
Arguments
|
1393
|
+
=========
|
1394
|
+
|
1395
|
+
n (input) int
|
1396
|
+
The number of elements in a row or column. If transpose==0, then n is the number
|
1397
|
+
of columns; otherwise, n is the number of rows.
|
1398
|
+
|
1399
|
+
data1 (input) double array
|
1400
|
+
The data array containing the first vector.
|
1401
|
+
|
1402
|
+
data2 (input) double array
|
1403
|
+
The data array containing the second vector.
|
1404
|
+
|
1405
|
+
mask1 (input) int array
|
1406
|
+
This array which elements in data1 are missing. If mask1[i][j]==0, then
|
1407
|
+
data1[i][j] is missing.
|
1408
|
+
|
1409
|
+
mask2 (input) int array
|
1410
|
+
This array which elements in data2 are missing. If mask2[i][j]==0, then
|
1411
|
+
data2[i][j] is missing.
|
1412
|
+
|
1413
|
+
weight (input) double[n]
|
1414
|
+
The weights that are used to calculate the distance.
|
1415
|
+
|
1416
|
+
index1 (input) int
|
1417
|
+
Index of the first row or column.
|
1418
|
+
|
1419
|
+
index2 (input) int
|
1420
|
+
Index of the second row or column.
|
1421
|
+
|
1422
|
+
transpose (input) int
|
1423
|
+
If transpose==0, the distance between two rows in the matrix is calculated.
|
1424
|
+
Otherwise, the distance between two columns in the matrix is calculated.
|
1425
|
+
============================================================================
|
1426
|
+
*/
|
1427
|
+
{ double result = 0.;
|
1428
|
+
double denom1 = 0.;
|
1429
|
+
double denom2 = 0.;
|
1430
|
+
int flag = 0;
|
1431
|
+
/* flag will remain zero if no nonzero combinations of mask1 and mask2 are
|
1432
|
+
* found.
|
1433
|
+
*/
|
1434
|
+
if (transpose==0) /* Calculate the distance between two rows */
|
1435
|
+
{ int i;
|
1436
|
+
for (i = 0; i < n; i++)
|
1437
|
+
{ if (mask1[index1][i] && mask2[index2][i])
|
1438
|
+
{ double term1 = data1[index1][i];
|
1439
|
+
double term2 = data2[index2][i];
|
1440
|
+
double w = weight[i];
|
1441
|
+
result += w*term1*term2;
|
1442
|
+
denom1 += w*term1*term1;
|
1443
|
+
denom2 += w*term2*term2;
|
1444
|
+
flag = 1;
|
1445
|
+
}
|
1446
|
+
}
|
1447
|
+
}
|
1448
|
+
else
|
1449
|
+
{ int i;
|
1450
|
+
for (i = 0; i < n; i++)
|
1451
|
+
{ if (mask1[i][index1] && mask2[i][index2])
|
1452
|
+
{ double term1 = data1[i][index1];
|
1453
|
+
double term2 = data2[i][index2];
|
1454
|
+
double w = weight[i];
|
1455
|
+
result += w*term1*term2;
|
1456
|
+
denom1 += w*term1*term1;
|
1457
|
+
denom2 += w*term2*term2;
|
1458
|
+
flag = 1;
|
1459
|
+
}
|
1460
|
+
}
|
1461
|
+
}
|
1462
|
+
if (!flag) return 0.;
|
1463
|
+
if (denom1==0.) return 1.;
|
1464
|
+
if (denom2==0.) return 1.;
|
1465
|
+
result = fabs(result) / sqrt(denom1*denom2);
|
1466
|
+
result = 1. - result;
|
1467
|
+
return result;
|
1468
|
+
}
|
1469
|
+
|
1470
|
+
/* ********************************************************************* */
|
1471
|
+
|
1472
|
+
static
|
1473
|
+
double spearman (int n, double** data1, double** data2, int** mask1,
|
1474
|
+
int** mask2, const double weight[], int index1, int index2, int transpose)
|
1475
|
+
/*
|
1476
|
+
Purpose
|
1477
|
+
=======
|
1478
|
+
|
1479
|
+
The spearman routine calculates the Spearman distance between two rows or
|
1480
|
+
columns. The Spearman distance is defined as one minus the Spearman rank
|
1481
|
+
correlation.
|
1482
|
+
|
1483
|
+
Arguments
|
1484
|
+
=========
|
1485
|
+
|
1486
|
+
n (input) int
|
1487
|
+
The number of elements in a row or column. If transpose==0, then n is the number
|
1488
|
+
of columns; otherwise, n is the number of rows.
|
1489
|
+
|
1490
|
+
data1 (input) double array
|
1491
|
+
The data array containing the first vector.
|
1492
|
+
|
1493
|
+
data2 (input) double array
|
1494
|
+
The data array containing the second vector.
|
1495
|
+
|
1496
|
+
mask1 (input) int array
|
1497
|
+
This array which elements in data1 are missing. If mask1[i][j]==0, then
|
1498
|
+
data1[i][j] is missing.
|
1499
|
+
|
1500
|
+
mask2 (input) int array
|
1501
|
+
This array which elements in data2 are missing. If mask2[i][j]==0, then
|
1502
|
+
data2[i][j] is missing.
|
1503
|
+
|
1504
|
+
weight (input) double[n]
|
1505
|
+
These weights are ignored, but included for consistency with other distance
|
1506
|
+
measures.
|
1507
|
+
|
1508
|
+
index1 (input) int
|
1509
|
+
Index of the first row or column.
|
1510
|
+
|
1511
|
+
index2 (input) int
|
1512
|
+
Index of the second row or column.
|
1513
|
+
|
1514
|
+
transpose (input) int
|
1515
|
+
If transpose==0, the distance between two rows in the matrix is calculated.
|
1516
|
+
Otherwise, the distance between two columns in the matrix is calculated.
|
1517
|
+
============================================================================
|
1518
|
+
*/
|
1519
|
+
{ int i;
|
1520
|
+
int m = 0;
|
1521
|
+
double* rank1;
|
1522
|
+
double* rank2;
|
1523
|
+
double result = 0.;
|
1524
|
+
double denom1 = 0.;
|
1525
|
+
double denom2 = 0.;
|
1526
|
+
double avgrank;
|
1527
|
+
double* tdata1;
|
1528
|
+
double* tdata2;
|
1529
|
+
tdata1 = malloc(n*sizeof(double));
|
1530
|
+
if(!tdata1) return 0.0; /* Memory allocation error */
|
1531
|
+
tdata2 = malloc(n*sizeof(double));
|
1532
|
+
if(!tdata2) /* Memory allocation error */
|
1533
|
+
{ free(tdata1);
|
1534
|
+
return 0.0;
|
1535
|
+
}
|
1536
|
+
if (transpose==0)
|
1537
|
+
{ for (i = 0; i < n; i++)
|
1538
|
+
{ if (mask1[index1][i] && mask2[index2][i])
|
1539
|
+
{ tdata1[m] = data1[index1][i];
|
1540
|
+
tdata2[m] = data2[index2][i];
|
1541
|
+
m++;
|
1542
|
+
}
|
1543
|
+
}
|
1544
|
+
}
|
1545
|
+
else
|
1546
|
+
{ for (i = 0; i < n; i++)
|
1547
|
+
{ if (mask1[i][index1] && mask2[i][index2])
|
1548
|
+
{ tdata1[m] = data1[i][index1];
|
1549
|
+
tdata2[m] = data2[i][index2];
|
1550
|
+
m++;
|
1551
|
+
}
|
1552
|
+
}
|
1553
|
+
}
|
1554
|
+
if (m==0)
|
1555
|
+
{ free(tdata1);
|
1556
|
+
free(tdata2);
|
1557
|
+
return 0;
|
1558
|
+
}
|
1559
|
+
rank1 = getrank(m, tdata1);
|
1560
|
+
free(tdata1);
|
1561
|
+
if(!rank1)
|
1562
|
+
{ free(tdata2);
|
1563
|
+
return 0.0; /* Memory allocation error */
|
1564
|
+
}
|
1565
|
+
rank2 = getrank(m, tdata2);
|
1566
|
+
free(tdata2);
|
1567
|
+
if(!rank2) /* Memory allocation error */
|
1568
|
+
{ free(rank1);
|
1569
|
+
return 0.0;
|
1570
|
+
}
|
1571
|
+
avgrank = 0.5*(m-1); /* Average rank */
|
1572
|
+
for (i = 0; i < m; i++)
|
1573
|
+
{ const double value1 = rank1[i];
|
1574
|
+
const double value2 = rank2[i];
|
1575
|
+
result += value1 * value2;
|
1576
|
+
denom1 += value1 * value1;
|
1577
|
+
denom2 += value2 * value2;
|
1578
|
+
}
|
1579
|
+
/* Note: denom1 and denom2 cannot be calculated directly from the number
|
1580
|
+
* of elements. If two elements have the same rank, the squared sum of
|
1581
|
+
* their ranks will change.
|
1582
|
+
*/
|
1583
|
+
free(rank1);
|
1584
|
+
free(rank2);
|
1585
|
+
result /= m;
|
1586
|
+
denom1 /= m;
|
1587
|
+
denom2 /= m;
|
1588
|
+
result -= avgrank * avgrank;
|
1589
|
+
denom1 -= avgrank * avgrank;
|
1590
|
+
denom2 -= avgrank * avgrank;
|
1591
|
+
if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
|
1592
|
+
if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
|
1593
|
+
result = result / sqrt(denom1*denom2);
|
1594
|
+
result = 1. - result;
|
1595
|
+
return result;
|
1596
|
+
}
|
1597
|
+
|
1598
|
+
/* ********************************************************************* */
|
1599
|
+
|
1600
|
+
static
|
1601
|
+
double kendall (int n, double** data1, double** data2, int** mask1, int** mask2,
|
1602
|
+
const double weight[], int index1, int index2, int transpose)
|
1603
|
+
/*
|
1604
|
+
Purpose
|
1605
|
+
=======
|
1606
|
+
|
1607
|
+
The kendall routine calculates the Kendall distance between two
|
1608
|
+
rows or columns. The Kendall distance is defined as one minus Kendall's tau.
|
1609
|
+
|
1610
|
+
Arguments
|
1611
|
+
=========
|
1612
|
+
|
1613
|
+
n (input) int
|
1614
|
+
The number of elements in a row or column. If transpose==0, then n is the number
|
1615
|
+
of columns; otherwise, n is the number of rows.
|
1616
|
+
|
1617
|
+
data1 (input) double array
|
1618
|
+
The data array containing the first vector.
|
1619
|
+
|
1620
|
+
data2 (input) double array
|
1621
|
+
The data array containing the second vector.
|
1622
|
+
|
1623
|
+
mask1 (input) int array
|
1624
|
+
This array which elements in data1 are missing. If mask1[i][j]==0, then
|
1625
|
+
data1[i][j] is missing.
|
1626
|
+
|
1627
|
+
mask2 (input) int array
|
1628
|
+
This array which elements in data2 are missing. If mask2[i][j]==0, then
|
1629
|
+
data2[i][j] is missing.
|
1630
|
+
|
1631
|
+
weight (input) double[n]
|
1632
|
+
These weights are ignored, but included for consistency with other distance
|
1633
|
+
measures.
|
1634
|
+
|
1635
|
+
index1 (input) int
|
1636
|
+
Index of the first row or column.
|
1637
|
+
|
1638
|
+
index2 (input) int
|
1639
|
+
Index of the second row or column.
|
1640
|
+
|
1641
|
+
transpose (input) int
|
1642
|
+
If transpose==0, the distance between two rows in the matrix is calculated.
|
1643
|
+
Otherwise, the distance between two columns in the matrix is calculated.
|
1644
|
+
============================================================================
|
1645
|
+
*/
|
1646
|
+
{ int con = 0;
|
1647
|
+
int dis = 0;
|
1648
|
+
int exx = 0;
|
1649
|
+
int exy = 0;
|
1650
|
+
int flag = 0;
|
1651
|
+
/* flag will remain zero if no nonzero combinations of mask1 and mask2 are
|
1652
|
+
* found.
|
1653
|
+
*/
|
1654
|
+
double denomx;
|
1655
|
+
double denomy;
|
1656
|
+
double tau;
|
1657
|
+
int i, j;
|
1658
|
+
if (transpose==0)
|
1659
|
+
{ for (i = 0; i < n; i++)
|
1660
|
+
{ if (mask1[index1][i] && mask2[index2][i])
|
1661
|
+
{ for (j = 0; j < i; j++)
|
1662
|
+
{ if (mask1[index1][j] && mask2[index2][j])
|
1663
|
+
{ double x1 = data1[index1][i];
|
1664
|
+
double x2 = data1[index1][j];
|
1665
|
+
double y1 = data2[index2][i];
|
1666
|
+
double y2 = data2[index2][j];
|
1667
|
+
if (x1 < x2 && y1 < y2) con++;
|
1668
|
+
if (x1 > x2 && y1 > y2) con++;
|
1669
|
+
if (x1 < x2 && y1 > y2) dis++;
|
1670
|
+
if (x1 > x2 && y1 < y2) dis++;
|
1671
|
+
if (x1 == x2 && y1 != y2) exx++;
|
1672
|
+
if (x1 != x2 && y1 == y2) exy++;
|
1673
|
+
flag = 1;
|
1674
|
+
}
|
1675
|
+
}
|
1676
|
+
}
|
1677
|
+
}
|
1678
|
+
}
|
1679
|
+
else
|
1680
|
+
{ for (i = 0; i < n; i++)
|
1681
|
+
{ if (mask1[i][index1] && mask2[i][index2])
|
1682
|
+
{ for (j = 0; j < i; j++)
|
1683
|
+
{ if (mask1[j][index1] && mask2[j][index2])
|
1684
|
+
{ double x1 = data1[i][index1];
|
1685
|
+
double x2 = data1[j][index1];
|
1686
|
+
double y1 = data2[i][index2];
|
1687
|
+
double y2 = data2[j][index2];
|
1688
|
+
if (x1 < x2 && y1 < y2) con++;
|
1689
|
+
if (x1 > x2 && y1 > y2) con++;
|
1690
|
+
if (x1 < x2 && y1 > y2) dis++;
|
1691
|
+
if (x1 > x2 && y1 < y2) dis++;
|
1692
|
+
if (x1 == x2 && y1 != y2) exx++;
|
1693
|
+
if (x1 != x2 && y1 == y2) exy++;
|
1694
|
+
flag = 1;
|
1695
|
+
}
|
1696
|
+
}
|
1697
|
+
}
|
1698
|
+
}
|
1699
|
+
}
|
1700
|
+
if (!flag) return 0.;
|
1701
|
+
denomx = con + dis + exx;
|
1702
|
+
denomy = con + dis + exy;
|
1703
|
+
if (denomx==0) return 1;
|
1704
|
+
if (denomy==0) return 1;
|
1705
|
+
tau = (con-dis)/sqrt(denomx*denomy);
|
1706
|
+
return 1.-tau;
|
1707
|
+
}
|
1708
|
+
|
1709
|
+
/* ********************************************************************* */
|
1710
|
+
|
1711
|
+
static double(*setmetric(char dist))
|
1712
|
+
(int, double**, double**, int**, int**, const double[], int, int, int)
|
1713
|
+
{ switch(dist)
|
1714
|
+
{ case 'e': return &euclid;
|
1715
|
+
case 'b': return &cityblock;
|
1716
|
+
case 'c': return &correlation;
|
1717
|
+
case 'a': return &acorrelation;
|
1718
|
+
case 'u': return &ucorrelation;
|
1719
|
+
case 'x': return &uacorrelation;
|
1720
|
+
case 's': return &spearman;
|
1721
|
+
case 'k': return &kendall;
|
1722
|
+
default: return &euclid;
|
1723
|
+
}
|
1724
|
+
return NULL; /* Never get here */
|
1725
|
+
}
|
1726
|
+
|
1727
|
+
/* ********************************************************************* */
|
1728
|
+
|
1729
|
+
static double uniform(void)
|
1730
|
+
/*
|
1731
|
+
Purpose
|
1732
|
+
=======
|
1733
|
+
|
1734
|
+
This routine returns a uniform random number between 0.0 and 1.0. Both 0.0
|
1735
|
+
and 1.0 are excluded. This random number generator is described in:
|
1736
|
+
|
1737
|
+
Pierre l'Ecuyer
|
1738
|
+
Efficient and Portable Combined Random Number Generators
|
1739
|
+
Communications of the ACM, Volume 31, Number 6, June 1988, pages 742-749,774.
|
1740
|
+
|
1741
|
+
The first time this routine is called, it initializes the random number
|
1742
|
+
generator using the current time. First, the current epoch time in seconds is
|
1743
|
+
used as a seed for the random number generator in the C library. The first two
|
1744
|
+
random numbers generated by this generator are used to initialize the random
|
1745
|
+
number generator implemented in this routine.
|
1746
|
+
|
1747
|
+
|
1748
|
+
Arguments
|
1749
|
+
=========
|
1750
|
+
|
1751
|
+
None.
|
1752
|
+
|
1753
|
+
|
1754
|
+
Return value
|
1755
|
+
============
|
1756
|
+
|
1757
|
+
A double-precison number between 0.0 and 1.0.
|
1758
|
+
============================================================================
|
1759
|
+
*/
|
1760
|
+
{ int z;
|
1761
|
+
static const int m1 = 2147483563;
|
1762
|
+
static const int m2 = 2147483399;
|
1763
|
+
const double scale = 1.0/m1;
|
1764
|
+
|
1765
|
+
static int s1 = 0;
|
1766
|
+
static int s2 = 0;
|
1767
|
+
|
1768
|
+
if (s1==0 || s2==0) /* initialize */
|
1769
|
+
{ unsigned int initseed = (unsigned int) time(0);
|
1770
|
+
srand(initseed);
|
1771
|
+
s1 = rand();
|
1772
|
+
s2 = rand();
|
1773
|
+
}
|
1774
|
+
|
1775
|
+
do
|
1776
|
+
{ int k;
|
1777
|
+
k = s1/53668;
|
1778
|
+
s1 = 40014*(s1-k*53668)-k*12211;
|
1779
|
+
if (s1 < 0) s1+=m1;
|
1780
|
+
k = s2/52774;
|
1781
|
+
s2 = 40692*(s2-k*52774)-k*3791;
|
1782
|
+
if(s2 < 0) s2+=m2;
|
1783
|
+
z = s1-s2;
|
1784
|
+
if(z < 1) z+=(m1-1);
|
1785
|
+
} while (z==m1); /* To avoid returning 1.0 */
|
1786
|
+
|
1787
|
+
return z*scale;
|
1788
|
+
}
|
1789
|
+
|
1790
|
+
/* ************************************************************************ */
|
1791
|
+
|
1792
|
+
static int binomial(int n, double p)
|
1793
|
+
/*
|
1794
|
+
Purpose
|
1795
|
+
=======
|
1796
|
+
|
1797
|
+
This routine generates a random number between 0 and n inclusive, following
|
1798
|
+
the binomial distribution with probability p and n trials. The routine is
|
1799
|
+
based on the BTPE algorithm, described in:
|
1800
|
+
|
1801
|
+
Voratas Kachitvichyanukul and Bruce W. Schmeiser:
|
1802
|
+
Binomial Random Variate Generation
|
1803
|
+
Communications of the ACM, Volume 31, Number 2, February 1988, pages 216-222.
|
1804
|
+
|
1805
|
+
|
1806
|
+
Arguments
|
1807
|
+
=========
|
1808
|
+
|
1809
|
+
p (input) double
|
1810
|
+
The probability of a single event. This probability should be less than or
|
1811
|
+
equal to 0.5.
|
1812
|
+
|
1813
|
+
n (input) int
|
1814
|
+
The number of trials.
|
1815
|
+
|
1816
|
+
|
1817
|
+
Return value
|
1818
|
+
============
|
1819
|
+
|
1820
|
+
An integer drawn from a binomial distribution with parameters (p, n).
|
1821
|
+
|
1822
|
+
============================================================================
|
1823
|
+
*/
|
1824
|
+
{ const double q = 1 - p;
|
1825
|
+
if (n*p < 30.0) /* Algorithm BINV */
|
1826
|
+
{ const double s = p/q;
|
1827
|
+
const double a = (n+1)*s;
|
1828
|
+
double r = exp(n*log(q)); /* pow() causes a crash on AIX */
|
1829
|
+
int x = 0;
|
1830
|
+
double u = uniform();
|
1831
|
+
while(1)
|
1832
|
+
{ if (u < r) return x;
|
1833
|
+
u-=r;
|
1834
|
+
x++;
|
1835
|
+
r *= (a/x)-s;
|
1836
|
+
}
|
1837
|
+
}
|
1838
|
+
else /* Algorithm BTPE */
|
1839
|
+
{ /* Step 0 */
|
1840
|
+
const double fm = n*p + p;
|
1841
|
+
const int m = (int) fm;
|
1842
|
+
const double p1 = floor(2.195*sqrt(n*p*q) -4.6*q) + 0.5;
|
1843
|
+
const double xm = m + 0.5;
|
1844
|
+
const double xl = xm - p1;
|
1845
|
+
const double xr = xm + p1;
|
1846
|
+
const double c = 0.134 + 20.5/(15.3+m);
|
1847
|
+
const double a = (fm-xl)/(fm-xl*p);
|
1848
|
+
const double b = (xr-fm)/(xr*q);
|
1849
|
+
const double lambdal = a*(1.0+0.5*a);
|
1850
|
+
const double lambdar = b*(1.0+0.5*b);
|
1851
|
+
const double p2 = p1*(1+2*c);
|
1852
|
+
const double p3 = p2 + c/lambdal;
|
1853
|
+
const double p4 = p3 + c/lambdar;
|
1854
|
+
while (1)
|
1855
|
+
{ /* Step 1 */
|
1856
|
+
int y;
|
1857
|
+
int k;
|
1858
|
+
double u = uniform();
|
1859
|
+
double v = uniform();
|
1860
|
+
u *= p4;
|
1861
|
+
if (u <= p1) return (int)(xm-p1*v+u);
|
1862
|
+
/* Step 2 */
|
1863
|
+
if (u > p2)
|
1864
|
+
{ /* Step 3 */
|
1865
|
+
if (u > p3)
|
1866
|
+
{ /* Step 4 */
|
1867
|
+
y = (int)(xr-log(v)/lambdar);
|
1868
|
+
if (y > n) continue;
|
1869
|
+
/* Go to step 5 */
|
1870
|
+
v = v*(u-p3)*lambdar;
|
1871
|
+
}
|
1872
|
+
else
|
1873
|
+
{ y = (int)(xl+log(v)/lambdal);
|
1874
|
+
if (y < 0) continue;
|
1875
|
+
/* Go to step 5 */
|
1876
|
+
v = v*(u-p2)*lambdal;
|
1877
|
+
}
|
1878
|
+
}
|
1879
|
+
else
|
1880
|
+
{ const double x = xl + (u-p1)/c;
|
1881
|
+
v = v*c + 1.0 - fabs(m-x+0.5)/p1;
|
1882
|
+
if (v > 1) continue;
|
1883
|
+
/* Go to step 5 */
|
1884
|
+
y = (int)x;
|
1885
|
+
}
|
1886
|
+
/* Step 5 */
|
1887
|
+
/* Step 5.0 */
|
1888
|
+
k = abs(y-m);
|
1889
|
+
if (k > 20 && k < 0.5*n*p*q-1.0)
|
1890
|
+
{ /* Step 5.2 */
|
1891
|
+
double rho = (k/(n*p*q))*((k*(k/3.0 + 0.625) + 0.1666666666666)/(n*p*q)+0.5);
|
1892
|
+
double t = -k*k/(2*n*p*q);
|
1893
|
+
double A = log(v);
|
1894
|
+
if (A < t-rho) return y;
|
1895
|
+
else if (A > t+rho) continue;
|
1896
|
+
else
|
1897
|
+
{ /* Step 5.3 */
|
1898
|
+
double x1 = y+1;
|
1899
|
+
double f1 = m+1;
|
1900
|
+
double z = n+1-m;
|
1901
|
+
double w = n-y+1;
|
1902
|
+
double x2 = x1*x1;
|
1903
|
+
double f2 = f1*f1;
|
1904
|
+
double z2 = z*z;
|
1905
|
+
double w2 = w*w;
|
1906
|
+
if (A > xm * log(f1/x1) + (n-m+0.5)*log(z/w)
|
1907
|
+
+ (y-m)*log(w*p/(x1*q))
|
1908
|
+
+ (13860.-(462.-(132.-(99.-140./f2)/f2)/f2)/f2)/f1/166320.
|
1909
|
+
+ (13860.-(462.-(132.-(99.-140./z2)/z2)/z2)/z2)/z/166320.
|
1910
|
+
+ (13860.-(462.-(132.-(99.-140./x2)/x2)/x2)/x2)/x1/166320.
|
1911
|
+
+ (13860.-(462.-(132.-(99.-140./w2)/w2)/w2)/w2)/w/166320.)
|
1912
|
+
continue;
|
1913
|
+
return y;
|
1914
|
+
}
|
1915
|
+
}
|
1916
|
+
else
|
1917
|
+
{ /* Step 5.1 */
|
1918
|
+
int i;
|
1919
|
+
const double s = p/q;
|
1920
|
+
const double aa = s*(n+1);
|
1921
|
+
double f = 1.0;
|
1922
|
+
for (i = m; i < y; f *= (aa/(++i)-s));
|
1923
|
+
for (i = y; i < m; f /= (aa/(++i)-s));
|
1924
|
+
if (v > f) continue;
|
1925
|
+
return y;
|
1926
|
+
}
|
1927
|
+
}
|
1928
|
+
}
|
1929
|
+
/* Never get here */
|
1930
|
+
return -1;
|
1931
|
+
}
|
1932
|
+
|
1933
|
+
/* ************************************************************************ */
|
1934
|
+
|
1935
|
+
static void randomassign (int nclusters, int nelements, int clusterid[])
|
1936
|
+
/*
|
1937
|
+
Purpose
|
1938
|
+
=======
|
1939
|
+
|
1940
|
+
The randomassign routine performs an initial random clustering, needed for
|
1941
|
+
k-means or k-median clustering. Elements (genes or microarrays) are randomly
|
1942
|
+
assigned to clusters. The number of elements in each cluster is chosen
|
1943
|
+
randomly, making sure that each cluster will receive at least one element.
|
1944
|
+
|
1945
|
+
|
1946
|
+
Arguments
|
1947
|
+
=========
|
1948
|
+
|
1949
|
+
nclusters (input) int
|
1950
|
+
The number of clusters.
|
1951
|
+
|
1952
|
+
nelements (input) int
|
1953
|
+
The number of elements to be clustered (i.e., the number of genes or microarrays
|
1954
|
+
to be clustered).
|
1955
|
+
|
1956
|
+
clusterid (output) int[nelements]
|
1957
|
+
The cluster number to which an element was assigned.
|
1958
|
+
|
1959
|
+
============================================================================
|
1960
|
+
*/
|
1961
|
+
{ int i, j;
|
1962
|
+
int k = 0;
|
1963
|
+
double p;
|
1964
|
+
int n = nelements-nclusters;
|
1965
|
+
/* Draw the number of elements in each cluster from a multinomial
|
1966
|
+
* distribution, reserving ncluster elements to set independently
|
1967
|
+
* in order to guarantee that none of the clusters are empty.
|
1968
|
+
*/
|
1969
|
+
for (i = 0; i < nclusters-1; i++)
|
1970
|
+
{ p = 1.0/(nclusters-i);
|
1971
|
+
j = binomial(n, p);
|
1972
|
+
n -= j;
|
1973
|
+
j += k+1; /* Assign at least one element to cluster i */
|
1974
|
+
for ( ; k < j; k++) clusterid[k] = i;
|
1975
|
+
}
|
1976
|
+
/* Assign the remaining elements to the last cluster */
|
1977
|
+
for ( ; k < nelements; k++) clusterid[k] = i;
|
1978
|
+
|
1979
|
+
/* Create a random permutation of the cluster assignments */
|
1980
|
+
for (i = 0; i < nelements; i++)
|
1981
|
+
{ j = (int) (i + (nelements-i)*uniform());
|
1982
|
+
k = clusterid[j];
|
1983
|
+
clusterid[j] = clusterid[i];
|
1984
|
+
clusterid[i] = k;
|
1985
|
+
}
|
1986
|
+
|
1987
|
+
return;
|
1988
|
+
}
|
1989
|
+
|
1990
|
+
/* ********************************************************************* */
|
1991
|
+
|
1992
|
+
static void getclustermeans(int nclusters, int nrows, int ncolumns,
|
1993
|
+
double** data, int** mask, int clusterid[], double** cdata, int** cmask,
|
1994
|
+
int transpose)
|
1995
|
+
/*
|
1996
|
+
Purpose
|
1997
|
+
=======
|
1998
|
+
|
1999
|
+
The getclustermeans routine calculates the cluster centroids, given to which
|
2000
|
+
cluster each element belongs. The centroid is defined as the mean over all
|
2001
|
+
elements for each dimension.
|
2002
|
+
|
2003
|
+
Arguments
|
2004
|
+
=========
|
2005
|
+
|
2006
|
+
nclusters (input) int
|
2007
|
+
The number of clusters.
|
2008
|
+
|
2009
|
+
nrows (input) int
|
2010
|
+
The number of rows in the gene expression data matrix, equal to the number of
|
2011
|
+
genes.
|
2012
|
+
|
2013
|
+
ncolumns (input) int
|
2014
|
+
The number of columns in the gene expression data matrix, equal to the number of
|
2015
|
+
microarrays.
|
2016
|
+
|
2017
|
+
data (input) double[nrows][ncolumns]
|
2018
|
+
The array containing the gene expression data.
|
2019
|
+
|
2020
|
+
mask (input) int[nrows][ncolumns]
|
2021
|
+
This array shows which data values are missing. If mask[i][j]==0, then
|
2022
|
+
data[i][j] is missing.
|
2023
|
+
|
2024
|
+
clusterid (output) int[nrows] if transpose==0
|
2025
|
+
int[ncolumns] if transpose==1
|
2026
|
+
The cluster number to which each element belongs. If transpose==0, then the
|
2027
|
+
dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
|
2028
|
+
is equal to ncolumns (the number of microarrays).
|
2029
|
+
|
2030
|
+
cdata (output) double[nclusters][ncolumns] if transpose==0
|
2031
|
+
double[nrows][nclusters] if transpose==1
|
2032
|
+
On exit of getclustermeans, this array contains the cluster centroids.
|
2033
|
+
|
2034
|
+
cmask (output) int[nclusters][ncolumns] if transpose==0
|
2035
|
+
int[nrows][nclusters] if transpose==1
|
2036
|
+
This array shows which data values of are missing for each centroid. If
|
2037
|
+
cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
|
2038
|
+
a centroid if all corresponding data values of the cluster members are missing.
|
2039
|
+
|
2040
|
+
transpose (input) int
|
2041
|
+
If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
|
2042
|
+
columns (microarrays) are specified.
|
2043
|
+
|
2044
|
+
========================================================================
|
2045
|
+
*/
|
2046
|
+
{ int i, j, k;
|
2047
|
+
if (transpose==0)
|
2048
|
+
{ for (i = 0; i < nclusters; i++)
|
2049
|
+
{ for (j = 0; j < ncolumns; j++)
|
2050
|
+
{ cmask[i][j] = 0;
|
2051
|
+
cdata[i][j] = 0.;
|
2052
|
+
}
|
2053
|
+
}
|
2054
|
+
for (k = 0; k < nrows; k++)
|
2055
|
+
{ i = clusterid[k];
|
2056
|
+
for (j = 0; j < ncolumns; j++)
|
2057
|
+
{ if (mask[k][j] != 0)
|
2058
|
+
{ cdata[i][j]+=data[k][j];
|
2059
|
+
cmask[i][j]++;
|
2060
|
+
}
|
2061
|
+
}
|
2062
|
+
}
|
2063
|
+
for (i = 0; i < nclusters; i++)
|
2064
|
+
{ for (j = 0; j < ncolumns; j++)
|
2065
|
+
{ if (cmask[i][j]>0)
|
2066
|
+
{ cdata[i][j] /= cmask[i][j];
|
2067
|
+
cmask[i][j] = 1;
|
2068
|
+
}
|
2069
|
+
}
|
2070
|
+
}
|
2071
|
+
}
|
2072
|
+
else
|
2073
|
+
{ for (i = 0; i < nrows; i++)
|
2074
|
+
{ for (j = 0; j < nclusters; j++)
|
2075
|
+
{ cdata[i][j] = 0.;
|
2076
|
+
cmask[i][j] = 0;
|
2077
|
+
}
|
2078
|
+
}
|
2079
|
+
for (k = 0; k < ncolumns; k++)
|
2080
|
+
{ i = clusterid[k];
|
2081
|
+
for (j = 0; j < nrows; j++)
|
2082
|
+
{ if (mask[j][k] != 0)
|
2083
|
+
{ cdata[j][i]+=data[j][k];
|
2084
|
+
cmask[j][i]++;
|
2085
|
+
}
|
2086
|
+
}
|
2087
|
+
}
|
2088
|
+
for (i = 0; i < nrows; i++)
|
2089
|
+
{ for (j = 0; j < nclusters; j++)
|
2090
|
+
{ if (cmask[i][j]>0)
|
2091
|
+
{ cdata[i][j] /= cmask[i][j];
|
2092
|
+
cmask[i][j] = 1;
|
2093
|
+
}
|
2094
|
+
}
|
2095
|
+
}
|
2096
|
+
}
|
2097
|
+
}
|
2098
|
+
|
2099
|
+
/* ********************************************************************* */
|
2100
|
+
|
2101
|
+
static void
|
2102
|
+
getclustermedians(int nclusters, int nrows, int ncolumns,
|
2103
|
+
double** data, int** mask, int clusterid[], double** cdata, int** cmask,
|
2104
|
+
int transpose, double cache[])
|
2105
|
+
/*
|
2106
|
+
Purpose
|
2107
|
+
=======
|
2108
|
+
|
2109
|
+
The getclustermedians routine calculates the cluster centroids, given to which
|
2110
|
+
cluster each element belongs. The centroid is defined as the median over all
|
2111
|
+
elements for each dimension.
|
2112
|
+
|
2113
|
+
Arguments
|
2114
|
+
=========
|
2115
|
+
|
2116
|
+
nclusters (input) int
|
2117
|
+
The number of clusters.
|
2118
|
+
|
2119
|
+
nrows (input) int
|
2120
|
+
The number of rows in the gene expression data matrix, equal to the number of
|
2121
|
+
genes.
|
2122
|
+
|
2123
|
+
ncolumns (input) int
|
2124
|
+
The number of columns in the gene expression data matrix, equal to the number of
|
2125
|
+
microarrays.
|
2126
|
+
|
2127
|
+
data (input) double[nrows][ncolumns]
|
2128
|
+
The array containing the gene expression data.
|
2129
|
+
|
2130
|
+
mask (input) int[nrows][ncolumns]
|
2131
|
+
This array shows which data values are missing. If mask[i][j]==0, then
|
2132
|
+
data[i][j] is missing.
|
2133
|
+
|
2134
|
+
clusterid (output) int[nrows] if transpose==0
|
2135
|
+
int[ncolumns] if transpose==1
|
2136
|
+
The cluster number to which each element belongs. If transpose==0, then the
|
2137
|
+
dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
|
2138
|
+
is equal to ncolumns (the number of microarrays).
|
2139
|
+
|
2140
|
+
cdata (output) double[nclusters][ncolumns] if transpose==0
|
2141
|
+
double[nrows][nclusters] if transpose==1
|
2142
|
+
On exit of getclustermedians, this array contains the cluster centroids.
|
2143
|
+
|
2144
|
+
cmask (output) int[nclusters][ncolumns] if transpose==0
|
2145
|
+
int[nrows][nclusters] if transpose==1
|
2146
|
+
This array shows which data values of are missing for each centroid. If
|
2147
|
+
cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
|
2148
|
+
a centroid if all corresponding data values of the cluster members are missing.
|
2149
|
+
|
2150
|
+
transpose (input) int
|
2151
|
+
If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
|
2152
|
+
columns (microarrays) are specified.
|
2153
|
+
|
2154
|
+
cache (input) double[nrows] if transpose==0
|
2155
|
+
double[ncolumns] if transpose==1
|
2156
|
+
This array should be allocated before calling getclustermedians; its contents
|
2157
|
+
on input is not relevant. This array is used as a temporary storage space when
|
2158
|
+
calculating the medians.
|
2159
|
+
|
2160
|
+
========================================================================
|
2161
|
+
*/
|
2162
|
+
{ int i, j, k;
|
2163
|
+
if (transpose==0)
|
2164
|
+
{ for (i = 0; i < nclusters; i++)
|
2165
|
+
{ for (j = 0; j < ncolumns; j++)
|
2166
|
+
{ int count = 0;
|
2167
|
+
for (k = 0; k < nrows; k++)
|
2168
|
+
{ if (i==clusterid[k] && mask[k][j])
|
2169
|
+
{ cache[count] = data[k][j];
|
2170
|
+
count++;
|
2171
|
+
}
|
2172
|
+
}
|
2173
|
+
if (count>0)
|
2174
|
+
{ cdata[i][j] = median(count,cache);
|
2175
|
+
cmask[i][j] = 1;
|
2176
|
+
}
|
2177
|
+
else
|
2178
|
+
{ cdata[i][j] = 0.;
|
2179
|
+
cmask[i][j] = 0;
|
2180
|
+
}
|
2181
|
+
}
|
2182
|
+
}
|
2183
|
+
}
|
2184
|
+
else
|
2185
|
+
{ for (i = 0; i < nclusters; i++)
|
2186
|
+
{ for (j = 0; j < nrows; j++)
|
2187
|
+
{ int count = 0;
|
2188
|
+
for (k = 0; k < ncolumns; k++)
|
2189
|
+
{ if (i==clusterid[k] && mask[j][k])
|
2190
|
+
{ cache[count] = data[j][k];
|
2191
|
+
count++;
|
2192
|
+
}
|
2193
|
+
}
|
2194
|
+
if (count>0)
|
2195
|
+
{ cdata[j][i] = median(count,cache);
|
2196
|
+
cmask[j][i] = 1;
|
2197
|
+
}
|
2198
|
+
else
|
2199
|
+
{ cdata[j][i] = 0.;
|
2200
|
+
cmask[j][i] = 0;
|
2201
|
+
}
|
2202
|
+
}
|
2203
|
+
}
|
2204
|
+
}
|
2205
|
+
}
|
2206
|
+
|
2207
|
+
/* ********************************************************************* */
|
2208
|
+
|
2209
|
+
int getclustercentroids(int nclusters, int nrows, int ncolumns,
|
2210
|
+
double** data, int** mask, int clusterid[], double** cdata, int** cmask,
|
2211
|
+
int transpose, char method)
|
2212
|
+
/*
|
2213
|
+
Purpose
|
2214
|
+
=======
|
2215
|
+
|
2216
|
+
The getclustercentroids routine calculates the cluster centroids, given to
|
2217
|
+
which cluster each element belongs. Depending on the argument method, the
|
2218
|
+
centroid is defined as either the mean or the median for each dimension over
|
2219
|
+
all elements belonging to a cluster.
|
2220
|
+
|
2221
|
+
Arguments
|
2222
|
+
=========
|
2223
|
+
|
2224
|
+
nclusters (input) int
|
2225
|
+
The number of clusters.
|
2226
|
+
|
2227
|
+
nrows (input) int
|
2228
|
+
The number of rows in the gene expression data matrix, equal to the number of
|
2229
|
+
genes.
|
2230
|
+
|
2231
|
+
ncolumns (input) int
|
2232
|
+
The number of columns in the gene expression data matrix, equal to the number of
|
2233
|
+
microarrays.
|
2234
|
+
|
2235
|
+
data (input) double[nrows][ncolumns]
|
2236
|
+
The array containing the gene expression data.
|
2237
|
+
|
2238
|
+
mask (input) int[nrows][ncolumns]
|
2239
|
+
This array shows which data values are missing. If mask[i][j]==0, then
|
2240
|
+
data[i][j] is missing.
|
2241
|
+
|
2242
|
+
clusterid (output) int[nrows] if transpose==0
|
2243
|
+
int[ncolumns] if transpose==1
|
2244
|
+
The cluster number to which each element belongs. If transpose==0, then the
|
2245
|
+
dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
|
2246
|
+
is equal to ncolumns (the number of microarrays).
|
2247
|
+
|
2248
|
+
cdata (output) double[nclusters][ncolumns] if transpose==0
|
2249
|
+
double[nrows][nclusters] if transpose==1
|
2250
|
+
On exit of getclustercentroids, this array contains the cluster centroids.
|
2251
|
+
|
2252
|
+
cmask (output) int[nclusters][ncolumns] if transpose==0
|
2253
|
+
int[nrows][nclusters] if transpose==1
|
2254
|
+
This array shows which data values of are missing for each centroid. If
|
2255
|
+
cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
|
2256
|
+
a centroid if all corresponding data values of the cluster members are missing.
|
2257
|
+
|
2258
|
+
transpose (input) int
|
2259
|
+
If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
|
2260
|
+
columns (microarrays) are specified.
|
2261
|
+
|
2262
|
+
method (input) char
|
2263
|
+
For method=='a', the centroid is defined as the mean over all elements
|
2264
|
+
belonging to a cluster for each dimension.
|
2265
|
+
For method=='m', the centroid is defined as the median over all elements
|
2266
|
+
belonging to a cluster for each dimension.
|
2267
|
+
|
2268
|
+
Return value
|
2269
|
+
============
|
2270
|
+
|
2271
|
+
The function returns an integer to indicate success or failure. If a
|
2272
|
+
memory error occurs, or if method is not 'm' or 'a', getclustercentroids
|
2273
|
+
returns 0. If successful, getclustercentroids returns 1.
|
2274
|
+
========================================================================
|
2275
|
+
*/
|
2276
|
+
{ switch(method)
|
2277
|
+
{ case 'm':
|
2278
|
+
{ const int nelements = (transpose==0) ? nrows : ncolumns;
|
2279
|
+
double* cache = malloc(nelements*sizeof(double));
|
2280
|
+
if (!cache) return 0;
|
2281
|
+
getclustermedians(nclusters, nrows, ncolumns, data, mask, clusterid,
|
2282
|
+
cdata, cmask, transpose, cache);
|
2283
|
+
free(cache);
|
2284
|
+
return 1;
|
2285
|
+
}
|
2286
|
+
case 'a':
|
2287
|
+
{ getclustermeans(nclusters, nrows, ncolumns, data, mask, clusterid,
|
2288
|
+
cdata, cmask, transpose);
|
2289
|
+
return 1;
|
2290
|
+
}
|
2291
|
+
}
|
2292
|
+
return 0;
|
2293
|
+
}
|
2294
|
+
|
2295
|
+
/* ********************************************************************* */
|
2296
|
+
|
2297
|
+
void getclustermedoids(int nclusters, int nelements, double** distance,
|
2298
|
+
int clusterid[], int centroids[], double errors[])
|
2299
|
+
/*
|
2300
|
+
Purpose
|
2301
|
+
=======
|
2302
|
+
|
2303
|
+
The getclustermedoids routine calculates the cluster centroids, given to which
|
2304
|
+
cluster each element belongs. The centroid is defined as the element with the
|
2305
|
+
smallest sum of distances to the other elements.
|
2306
|
+
|
2307
|
+
Arguments
|
2308
|
+
=========
|
2309
|
+
|
2310
|
+
nclusters (input) int
|
2311
|
+
The number of clusters.
|
2312
|
+
|
2313
|
+
nelements (input) int
|
2314
|
+
The total number of elements.
|
2315
|
+
|
2316
|
+
distmatrix (input) double array, ragged
|
2317
|
+
(number of rows is nelements, number of columns is equal to the row number)
|
2318
|
+
The distance matrix. To save space, the distance matrix is given in the
|
2319
|
+
form of a ragged array. The distance matrix is symmetric and has zeros
|
2320
|
+
on the diagonal. See distancematrix for a description of the content.
|
2321
|
+
|
2322
|
+
clusterid (output) int[nelements]
|
2323
|
+
The cluster number to which each element belongs.
|
2324
|
+
|
2325
|
+
centroid (output) int[nclusters]
|
2326
|
+
The index of the element that functions as the centroid for each cluster.
|
2327
|
+
|
2328
|
+
errors (output) double[nclusters]
|
2329
|
+
The within-cluster sum of distances between the items and the cluster
|
2330
|
+
centroid.
|
2331
|
+
|
2332
|
+
========================================================================
|
2333
|
+
*/
|
2334
|
+
{ int i, j, k;
|
2335
|
+
for (j = 0; j < nclusters; j++) errors[j] = DBL_MAX;
|
2336
|
+
for (i = 0; i < nelements; i++)
|
2337
|
+
{ double d = 0.0;
|
2338
|
+
j = clusterid[i];
|
2339
|
+
for (k = 0; k < nelements; k++)
|
2340
|
+
{ if (i==k || clusterid[k]!=j) continue;
|
2341
|
+
d += (i < k ? distance[k][i] : distance[i][k]);
|
2342
|
+
if (d > errors[j]) break;
|
2343
|
+
}
|
2344
|
+
if (d < errors[j])
|
2345
|
+
{ errors[j] = d;
|
2346
|
+
centroids[j] = i;
|
2347
|
+
}
|
2348
|
+
}
|
2349
|
+
}
|
2350
|
+
|
2351
|
+
/* ********************************************************************* */
|
2352
|
+
|
2353
|
+
static int
|
2354
|
+
kmeans(int nclusters, int nrows, int ncolumns, double** data, int** mask,
|
2355
|
+
double weight[], int transpose, int npass, char dist,
|
2356
|
+
double** cdata, int** cmask, int clusterid[], double* error,
|
2357
|
+
int tclusterid[], int counts[], int mapping[])
|
2358
|
+
{ int i, j, k;
|
2359
|
+
const int nelements = (transpose==0) ? nrows : ncolumns;
|
2360
|
+
const int ndata = (transpose==0) ? ncolumns : nrows;
|
2361
|
+
int ifound = 1;
|
2362
|
+
int ipass = 0;
|
2363
|
+
/* Set the metric function as indicated by dist */
|
2364
|
+
double (*metric)
|
2365
|
+
(int, double**, double**, int**, int**, const double[], int, int, int) =
|
2366
|
+
setmetric(dist);
|
2367
|
+
|
2368
|
+
/* We save the clustering solution periodically and check if it reappears */
|
2369
|
+
int* saved = malloc(nelements*sizeof(int));
|
2370
|
+
if (saved==NULL) return -1;
|
2371
|
+
|
2372
|
+
*error = DBL_MAX;
|
2373
|
+
|
2374
|
+
do
|
2375
|
+
{ double total = DBL_MAX;
|
2376
|
+
int counter = 0;
|
2377
|
+
int period = 10;
|
2378
|
+
|
2379
|
+
/* Perform the EM algorithm. First, randomly assign elements to clusters. */
|
2380
|
+
if (npass!=0) randomassign (nclusters, nelements, tclusterid);
|
2381
|
+
|
2382
|
+
for (i = 0; i < nclusters; i++) counts[i] = 0;
|
2383
|
+
for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
|
2384
|
+
|
2385
|
+
/* Start the loop */
|
2386
|
+
while(1)
|
2387
|
+
{ double previous = total;
|
2388
|
+
total = 0.0;
|
2389
|
+
|
2390
|
+
if (counter % period == 0) /* Save the current cluster assignments */
|
2391
|
+
{ for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
|
2392
|
+
if (period < INT_MAX / 2) period *= 2;
|
2393
|
+
}
|
2394
|
+
counter++;
|
2395
|
+
|
2396
|
+
/* Find the center */
|
2397
|
+
getclustermeans(nclusters, nrows, ncolumns, data, mask, tclusterid,
|
2398
|
+
cdata, cmask, transpose);
|
2399
|
+
|
2400
|
+
for (i = 0; i < nelements; i++)
|
2401
|
+
/* Calculate the distances */
|
2402
|
+
{ double distance;
|
2403
|
+
k = tclusterid[i];
|
2404
|
+
if (counts[k]==1) continue;
|
2405
|
+
/* No reassignment if that would lead to an empty cluster */
|
2406
|
+
/* Treat the present cluster as a special case */
|
2407
|
+
distance = metric(ndata,data,cdata,mask,cmask,weight,i,k,transpose);
|
2408
|
+
for (j = 0; j < nclusters; j++)
|
2409
|
+
{ double tdistance;
|
2410
|
+
if (j==k) continue;
|
2411
|
+
tdistance = metric(ndata,data,cdata,mask,cmask,weight,i,j,transpose);
|
2412
|
+
if (tdistance < distance)
|
2413
|
+
{ distance = tdistance;
|
2414
|
+
counts[tclusterid[i]]--;
|
2415
|
+
tclusterid[i] = j;
|
2416
|
+
counts[j]++;
|
2417
|
+
}
|
2418
|
+
}
|
2419
|
+
total += distance;
|
2420
|
+
}
|
2421
|
+
if (total>=previous) break;
|
2422
|
+
/* total>=previous is FALSE on some machines even if total and previous
|
2423
|
+
* are bitwise identical. */
|
2424
|
+
for (i = 0; i < nelements; i++)
|
2425
|
+
if (saved[i]!=tclusterid[i]) break;
|
2426
|
+
if (i==nelements)
|
2427
|
+
break; /* Identical solution found; break out of this loop */
|
2428
|
+
}
|
2429
|
+
|
2430
|
+
if (npass<=1)
|
2431
|
+
{ *error = total;
|
2432
|
+
break;
|
2433
|
+
}
|
2434
|
+
|
2435
|
+
for (i = 0; i < nclusters; i++) mapping[i] = -1;
|
2436
|
+
for (i = 0; i < nelements; i++)
|
2437
|
+
{ j = tclusterid[i];
|
2438
|
+
k = clusterid[i];
|
2439
|
+
if (mapping[k] == -1) mapping[k] = j;
|
2440
|
+
else if (mapping[k] != j)
|
2441
|
+
{ if (total < *error)
|
2442
|
+
{ ifound = 1;
|
2443
|
+
*error = total;
|
2444
|
+
for (j = 0; j < nelements; j++) clusterid[j] = tclusterid[j];
|
2445
|
+
}
|
2446
|
+
break;
|
2447
|
+
}
|
2448
|
+
}
|
2449
|
+
if (i==nelements) ifound++; /* break statement not encountered */
|
2450
|
+
} while (++ipass < npass);
|
2451
|
+
|
2452
|
+
free(saved);
|
2453
|
+
return ifound;
|
2454
|
+
}
|
2455
|
+
|
2456
|
+
/* ---------------------------------------------------------------------- */
|
2457
|
+
|
2458
|
+
static int
|
2459
|
+
kmedians(int nclusters, int nrows, int ncolumns, double** data, int** mask,
|
2460
|
+
double weight[], int transpose, int npass, char dist,
|
2461
|
+
double** cdata, int** cmask, int clusterid[], double* error,
|
2462
|
+
int tclusterid[], int counts[], int mapping[], double cache[])
|
2463
|
+
{ int i, j, k;
|
2464
|
+
const int nelements = (transpose==0) ? nrows : ncolumns;
|
2465
|
+
const int ndata = (transpose==0) ? ncolumns : nrows;
|
2466
|
+
int ifound = 1;
|
2467
|
+
int ipass = 0;
|
2468
|
+
/* Set the metric function as indicated by dist */
|
2469
|
+
double (*metric)
|
2470
|
+
(int, double**, double**, int**, int**, const double[], int, int, int) =
|
2471
|
+
setmetric(dist);
|
2472
|
+
|
2473
|
+
/* We save the clustering solution periodically and check if it reappears */
|
2474
|
+
int* saved = malloc(nelements*sizeof(int));
|
2475
|
+
if (saved==NULL) return -1;
|
2476
|
+
|
2477
|
+
*error = DBL_MAX;
|
2478
|
+
|
2479
|
+
do
|
2480
|
+
{ double total = DBL_MAX;
|
2481
|
+
int counter = 0;
|
2482
|
+
int period = 10;
|
2483
|
+
|
2484
|
+
/* Perform the EM algorithm. First, randomly assign elements to clusters. */
|
2485
|
+
if (npass!=0) randomassign (nclusters, nelements, tclusterid);
|
2486
|
+
|
2487
|
+
for (i = 0; i < nclusters; i++) counts[i]=0;
|
2488
|
+
for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
|
2489
|
+
|
2490
|
+
/* Start the loop */
|
2491
|
+
while(1)
|
2492
|
+
{ double previous = total;
|
2493
|
+
total = 0.0;
|
2494
|
+
|
2495
|
+
if (counter % period == 0) /* Save the current cluster assignments */
|
2496
|
+
{ for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
|
2497
|
+
if (period < INT_MAX / 2) period *= 2;
|
2498
|
+
}
|
2499
|
+
counter++;
|
2500
|
+
|
2501
|
+
/* Find the center */
|
2502
|
+
getclustermedians(nclusters, nrows, ncolumns, data, mask, tclusterid,
|
2503
|
+
cdata, cmask, transpose, cache);
|
2504
|
+
|
2505
|
+
for (i = 0; i < nelements; i++)
|
2506
|
+
/* Calculate the distances */
|
2507
|
+
{ double distance;
|
2508
|
+
k = tclusterid[i];
|
2509
|
+
if (counts[k]==1) continue;
|
2510
|
+
/* No reassignment if that would lead to an empty cluster */
|
2511
|
+
/* Treat the present cluster as a special case */
|
2512
|
+
distance = metric(ndata,data,cdata,mask,cmask,weight,i,k,transpose);
|
2513
|
+
for (j = 0; j < nclusters; j++)
|
2514
|
+
{ double tdistance;
|
2515
|
+
if (j==k) continue;
|
2516
|
+
tdistance = metric(ndata,data,cdata,mask,cmask,weight,i,j,transpose);
|
2517
|
+
if (tdistance < distance)
|
2518
|
+
{ distance = tdistance;
|
2519
|
+
counts[tclusterid[i]]--;
|
2520
|
+
tclusterid[i] = j;
|
2521
|
+
counts[j]++;
|
2522
|
+
}
|
2523
|
+
}
|
2524
|
+
total += distance;
|
2525
|
+
}
|
2526
|
+
if (total>=previous) break;
|
2527
|
+
/* total>=previous is FALSE on some machines even if total and previous
|
2528
|
+
* are bitwise identical. */
|
2529
|
+
for (i = 0; i < nelements; i++)
|
2530
|
+
if (saved[i]!=tclusterid[i]) break;
|
2531
|
+
if (i==nelements)
|
2532
|
+
break; /* Identical solution found; break out of this loop */
|
2533
|
+
}
|
2534
|
+
|
2535
|
+
if (npass<=1)
|
2536
|
+
{ *error = total;
|
2537
|
+
break;
|
2538
|
+
}
|
2539
|
+
|
2540
|
+
for (i = 0; i < nclusters; i++) mapping[i] = -1;
|
2541
|
+
for (i = 0; i < nelements; i++)
|
2542
|
+
{ j = tclusterid[i];
|
2543
|
+
k = clusterid[i];
|
2544
|
+
if (mapping[k] == -1) mapping[k] = j;
|
2545
|
+
else if (mapping[k] != j)
|
2546
|
+
{ if (total < *error)
|
2547
|
+
{ ifound = 1;
|
2548
|
+
*error = total;
|
2549
|
+
for (j = 0; j < nelements; j++) clusterid[j] = tclusterid[j];
|
2550
|
+
}
|
2551
|
+
break;
|
2552
|
+
}
|
2553
|
+
}
|
2554
|
+
if (i==nelements) ifound++; /* break statement not encountered */
|
2555
|
+
} while (++ipass < npass);
|
2556
|
+
|
2557
|
+
free(saved);
|
2558
|
+
return ifound;
|
2559
|
+
}
|
2560
|
+
|
2561
|
+
/* ********************************************************************* */
|
2562
|
+
|
2563
|
+
void kcluster (int nclusters, int nrows, int ncolumns,
|
2564
|
+
double** data, int** mask, double weight[], int transpose,
|
2565
|
+
int npass, char method, char dist,
|
2566
|
+
int clusterid[], double* error, int* ifound)
|
2567
|
+
/*
|
2568
|
+
Purpose
|
2569
|
+
=======
|
2570
|
+
|
2571
|
+
The kcluster routine performs k-means or k-median clustering on a given set of
|
2572
|
+
elements, using the specified distance measure. The number of clusters is given
|
2573
|
+
by the user. Multiple passes are being made to find the optimal clustering
|
2574
|
+
solution, each time starting from a different initial clustering.
|
2575
|
+
|
2576
|
+
|
2577
|
+
Arguments
|
2578
|
+
=========
|
2579
|
+
|
2580
|
+
nclusters (input) int
|
2581
|
+
The number of clusters to be found.
|
2582
|
+
|
2583
|
+
data (input) double[nrows][ncolumns]
|
2584
|
+
The array containing the data of the elements to be clustered (i.e., the gene
|
2585
|
+
expression data).
|
2586
|
+
|
2587
|
+
mask (input) int[nrows][ncolumns]
|
2588
|
+
This array shows which data values are missing. If
|
2589
|
+
mask[i][j] == 0, then data[i][j] is missing.
|
2590
|
+
|
2591
|
+
nrows (input) int
|
2592
|
+
The number of rows in the data matrix, equal to the number of genes.
|
2593
|
+
|
2594
|
+
ncolumns (input) int
|
2595
|
+
The number of columns in the data matrix, equal to the number of microarrays.
|
2596
|
+
|
2597
|
+
weight (input) double[n]
|
2598
|
+
The weights that are used to calculate the distance.
|
2599
|
+
|
2600
|
+
transpose (input) int
|
2601
|
+
If transpose==0, the rows of the matrix are clustered. Otherwise, columns
|
2602
|
+
of the matrix are clustered.
|
2603
|
+
|
2604
|
+
npass (input) int
|
2605
|
+
The number of times clustering is performed. Clustering is performed npass
|
2606
|
+
times, each time starting from a different (random) initial assignment of
|
2607
|
+
genes to clusters. The clustering solution with the lowest within-cluster sum
|
2608
|
+
of distances is chosen.
|
2609
|
+
If npass==0, then the clustering algorithm will be run once, where the initial
|
2610
|
+
assignment of elements to clusters is taken from the clusterid array.
|
2611
|
+
|
2612
|
+
method (input) char
|
2613
|
+
Defines whether the arithmetic mean (method=='a') or the median
|
2614
|
+
(method=='m') is used to calculate the cluster center.
|
2615
|
+
|
2616
|
+
dist (input) char
|
2617
|
+
Defines which distance measure is used, as given by the table:
|
2618
|
+
dist=='e': Euclidean distance
|
2619
|
+
dist=='b': City-block distance
|
2620
|
+
dist=='c': correlation
|
2621
|
+
dist=='a': absolute value of the correlation
|
2622
|
+
dist=='u': uncentered correlation
|
2623
|
+
dist=='x': absolute uncentered correlation
|
2624
|
+
dist=='s': Spearman's rank correlation
|
2625
|
+
dist=='k': Kendall's tau
|
2626
|
+
For other values of dist, the default (Euclidean distance) is used.
|
2627
|
+
|
2628
|
+
clusterid (output; input) int[nrows] if transpose==0
|
2629
|
+
int[ncolumns] if transpose==1
|
2630
|
+
The cluster number to which a gene or microarray was assigned. If npass==0,
|
2631
|
+
then on input clusterid contains the initial clustering assignment from which
|
2632
|
+
the clustering algorithm starts. On output, it contains the clustering solution
|
2633
|
+
that was found.
|
2634
|
+
|
2635
|
+
error (output) double*
|
2636
|
+
The sum of distances to the cluster center of each item in the optimal k-means
|
2637
|
+
clustering solution that was found.
|
2638
|
+
|
2639
|
+
ifound (output) int*
|
2640
|
+
The number of times the optimal clustering solution was
|
2641
|
+
found. The value of ifound is at least 1; its maximum value is npass. If the
|
2642
|
+
number of clusters is larger than the number of elements being clustered,
|
2643
|
+
*ifound is set to 0 as an error code. If a memory allocation error occurs,
|
2644
|
+
*ifound is set to -1.
|
2645
|
+
|
2646
|
+
========================================================================
|
2647
|
+
*/
|
2648
|
+
{ const int nelements = (transpose==0) ? nrows : ncolumns;
|
2649
|
+
const int ndata = (transpose==0) ? ncolumns : nrows;
|
2650
|
+
|
2651
|
+
int i;
|
2652
|
+
int ok;
|
2653
|
+
int* tclusterid;
|
2654
|
+
int* mapping = NULL;
|
2655
|
+
double** cdata;
|
2656
|
+
int** cmask;
|
2657
|
+
int* counts;
|
2658
|
+
|
2659
|
+
if (nelements < nclusters)
|
2660
|
+
{ *ifound = 0;
|
2661
|
+
return;
|
2662
|
+
}
|
2663
|
+
/* More clusters asked for than elements available */
|
2664
|
+
|
2665
|
+
*ifound = -1;
|
2666
|
+
|
2667
|
+
/* This will contain the number of elements in each cluster, which is
|
2668
|
+
* needed to check for empty clusters. */
|
2669
|
+
counts = malloc(nclusters*sizeof(int));
|
2670
|
+
if(!counts) return;
|
2671
|
+
|
2672
|
+
/* Find out if the user specified an initial clustering */
|
2673
|
+
if (npass<=1) tclusterid = clusterid;
|
2674
|
+
else
|
2675
|
+
{ tclusterid = malloc(nelements*sizeof(int));
|
2676
|
+
if (!tclusterid)
|
2677
|
+
{ free(counts);
|
2678
|
+
return;
|
2679
|
+
}
|
2680
|
+
mapping = malloc(nclusters*sizeof(int));
|
2681
|
+
if (!mapping)
|
2682
|
+
{ free(counts);
|
2683
|
+
free(tclusterid);
|
2684
|
+
return;
|
2685
|
+
}
|
2686
|
+
for (i = 0; i < nelements; i++) clusterid[i] = 0;
|
2687
|
+
}
|
2688
|
+
|
2689
|
+
/* Allocate space to store the centroid data */
|
2690
|
+
if (transpose==0) ok = makedatamask(nclusters, ndata, &cdata, &cmask);
|
2691
|
+
else ok = makedatamask(ndata, nclusters, &cdata, &cmask);
|
2692
|
+
if(!ok)
|
2693
|
+
{ free(counts);
|
2694
|
+
if(npass>1)
|
2695
|
+
{ free(tclusterid);
|
2696
|
+
free(mapping);
|
2697
|
+
return;
|
2698
|
+
}
|
2699
|
+
}
|
2700
|
+
|
2701
|
+
if (method=='m')
|
2702
|
+
{ double* cache = malloc(nelements*sizeof(double));
|
2703
|
+
if(cache)
|
2704
|
+
{ *ifound = kmedians(nclusters, nrows, ncolumns, data, mask, weight,
|
2705
|
+
transpose, npass, dist, cdata, cmask, clusterid, error,
|
2706
|
+
tclusterid, counts, mapping, cache);
|
2707
|
+
free(cache);
|
2708
|
+
}
|
2709
|
+
}
|
2710
|
+
else
|
2711
|
+
*ifound = kmeans(nclusters, nrows, ncolumns, data, mask, weight,
|
2712
|
+
transpose, npass, dist, cdata, cmask, clusterid, error,
|
2713
|
+
tclusterid, counts, mapping);
|
2714
|
+
|
2715
|
+
/* Deallocate temporarily used space */
|
2716
|
+
if (npass > 1)
|
2717
|
+
{ free(mapping);
|
2718
|
+
free(tclusterid);
|
2719
|
+
}
|
2720
|
+
|
2721
|
+
if (transpose==0) freedatamask(nclusters, cdata, cmask);
|
2722
|
+
else freedatamask(ndata, cdata, cmask);
|
2723
|
+
|
2724
|
+
free(counts);
|
2725
|
+
}
|
2726
|
+
|
2727
|
+
/* *********************************************************************** */
|
2728
|
+
|
2729
|
+
void kmedoids (int nclusters, int nelements, double** distmatrix,
|
2730
|
+
int npass, int clusterid[], double* error, int* ifound)
|
2731
|
+
/*
|
2732
|
+
Purpose
|
2733
|
+
=======
|
2734
|
+
|
2735
|
+
The kmedoids routine performs k-medoids clustering on a given set of elements,
|
2736
|
+
using the distance matrix and the number of clusters passed by the user.
|
2737
|
+
Multiple passes are being made to find the optimal clustering solution, each
|
2738
|
+
time starting from a different initial clustering.
|
2739
|
+
|
2740
|
+
|
2741
|
+
Arguments
|
2742
|
+
=========
|
2743
|
+
|
2744
|
+
nclusters (input) int
|
2745
|
+
The number of clusters to be found.
|
2746
|
+
|
2747
|
+
nelements (input) int
|
2748
|
+
The number of elements to be clustered.
|
2749
|
+
|
2750
|
+
distmatrix (input) double array, ragged
|
2751
|
+
(number of rows is nelements, number of columns is equal to the row number)
|
2752
|
+
The distance matrix. To save space, the distance matrix is given in the
|
2753
|
+
form of a ragged array. The distance matrix is symmetric and has zeros
|
2754
|
+
on the diagonal. See distancematrix for a description of the content.
|
2755
|
+
|
2756
|
+
npass (input) int
|
2757
|
+
The number of times clustering is performed. Clustering is performed npass
|
2758
|
+
times, each time starting from a different (random) initial assignment of genes
|
2759
|
+
to clusters. The clustering solution with the lowest within-cluster sum of
|
2760
|
+
distances is chosen.
|
2761
|
+
If npass==0, then the clustering algorithm will be run once, where the initial
|
2762
|
+
assignment of elements to clusters is taken from the clusterid array.
|
2763
|
+
|
2764
|
+
clusterid (output; input) int[nelements]
|
2765
|
+
On input, if npass==0, then clusterid contains the initial clustering assignment
|
2766
|
+
from which the clustering algorithm starts; all numbers in clusterid should be
|
2767
|
+
between zero and nelements-1 inclusive. If npass!=0, clusterid is ignored on
|
2768
|
+
input.
|
2769
|
+
On output, clusterid contains the clustering solution that was found: clusterid
|
2770
|
+
contains the number of the cluster to which each item was assigned. On output,
|
2771
|
+
the number of a cluster is defined as the item number of the centroid of the
|
2772
|
+
cluster.
|
2773
|
+
|
2774
|
+
error (output) double
|
2775
|
+
The sum of distances to the cluster center of each item in the optimal k-medoids
|
2776
|
+
clustering solution that was found.
|
2777
|
+
|
2778
|
+
ifound (output) int
|
2779
|
+
If kmedoids is successful: the number of times the optimal clustering solution
|
2780
|
+
was found. The value of ifound is at least 1; its maximum value is npass.
|
2781
|
+
If the user requested more clusters than elements available, ifound is set
|
2782
|
+
to 0. If kmedoids fails due to a memory allocation error, ifound is set to -1.
|
2783
|
+
|
2784
|
+
========================================================================
|
2785
|
+
*/
|
2786
|
+
{ int i, j, icluster;
|
2787
|
+
int* tclusterid;
|
2788
|
+
int* saved;
|
2789
|
+
int* centroids;
|
2790
|
+
double* errors;
|
2791
|
+
int ipass = 0;
|
2792
|
+
|
2793
|
+
if (nelements < nclusters)
|
2794
|
+
{ *ifound = 0;
|
2795
|
+
return;
|
2796
|
+
} /* More clusters asked for than elements available */
|
2797
|
+
|
2798
|
+
*ifound = -1;
|
2799
|
+
|
2800
|
+
/* We save the clustering solution periodically and check if it reappears */
|
2801
|
+
saved = malloc(nelements*sizeof(int));
|
2802
|
+
if (saved==NULL) return;
|
2803
|
+
|
2804
|
+
centroids = malloc(nclusters*sizeof(int));
|
2805
|
+
if(!centroids)
|
2806
|
+
{ free(saved);
|
2807
|
+
return;
|
2808
|
+
}
|
2809
|
+
|
2810
|
+
errors = malloc(nclusters*sizeof(double));
|
2811
|
+
if(!errors)
|
2812
|
+
{ free(saved);
|
2813
|
+
free(centroids);
|
2814
|
+
return;
|
2815
|
+
}
|
2816
|
+
|
2817
|
+
/* Find out if the user specified an initial clustering */
|
2818
|
+
if (npass<=1) tclusterid = clusterid;
|
2819
|
+
else
|
2820
|
+
{ tclusterid = malloc(nelements*sizeof(int));
|
2821
|
+
if(!tclusterid)
|
2822
|
+
{ free(saved);
|
2823
|
+
free(centroids);
|
2824
|
+
free(errors);
|
2825
|
+
return;
|
2826
|
+
}
|
2827
|
+
}
|
2828
|
+
|
2829
|
+
*error = DBL_MAX;
|
2830
|
+
do /* Start the loop */
|
2831
|
+
{ double total = DBL_MAX;
|
2832
|
+
int counter = 0;
|
2833
|
+
int period = 10;
|
2834
|
+
|
2835
|
+
if (npass!=0) randomassign (nclusters, nelements, tclusterid);
|
2836
|
+
while(1)
|
2837
|
+
{ double previous = total;
|
2838
|
+
total = 0.0;
|
2839
|
+
|
2840
|
+
if (counter % period == 0) /* Save the current cluster assignments */
|
2841
|
+
{ for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
|
2842
|
+
if (period < INT_MAX / 2) period *= 2;
|
2843
|
+
}
|
2844
|
+
counter++;
|
2845
|
+
|
2846
|
+
/* Find the center */
|
2847
|
+
getclustermedoids(nclusters, nelements, distmatrix, tclusterid,
|
2848
|
+
centroids, errors);
|
2849
|
+
|
2850
|
+
for (i = 0; i < nelements; i++)
|
2851
|
+
/* Find the closest cluster */
|
2852
|
+
{ double distance = DBL_MAX;
|
2853
|
+
for (icluster = 0; icluster < nclusters; icluster++)
|
2854
|
+
{ double tdistance;
|
2855
|
+
j = centroids[icluster];
|
2856
|
+
if (i==j)
|
2857
|
+
{ distance = 0.0;
|
2858
|
+
tclusterid[i] = icluster;
|
2859
|
+
break;
|
2860
|
+
}
|
2861
|
+
tdistance = (i > j) ? distmatrix[i][j] : distmatrix[j][i];
|
2862
|
+
if (tdistance < distance)
|
2863
|
+
{ distance = tdistance;
|
2864
|
+
tclusterid[i] = icluster;
|
2865
|
+
}
|
2866
|
+
}
|
2867
|
+
total += distance;
|
2868
|
+
}
|
2869
|
+
if (total>=previous) break;
|
2870
|
+
/* total>=previous is FALSE on some machines even if total and previous
|
2871
|
+
* are bitwise identical. */
|
2872
|
+
for (i = 0; i < nelements; i++)
|
2873
|
+
if (saved[i]!=tclusterid[i]) break;
|
2874
|
+
if (i==nelements)
|
2875
|
+
break; /* Identical solution found; break out of this loop */
|
2876
|
+
}
|
2877
|
+
|
2878
|
+
for (i = 0; i < nelements; i++)
|
2879
|
+
{ if (clusterid[i]!=centroids[tclusterid[i]])
|
2880
|
+
{ if (total < *error)
|
2881
|
+
{ *ifound = 1;
|
2882
|
+
*error = total;
|
2883
|
+
/* Replace by the centroid in each cluster. */
|
2884
|
+
for (j = 0; j < nelements; j++)
|
2885
|
+
clusterid[j] = centroids[tclusterid[j]];
|
2886
|
+
}
|
2887
|
+
break;
|
2888
|
+
}
|
2889
|
+
}
|
2890
|
+
if (i==nelements) (*ifound)++; /* break statement not encountered */
|
2891
|
+
} while (++ipass < npass);
|
2892
|
+
|
2893
|
+
/* Deallocate temporarily used space */
|
2894
|
+
if (npass > 1) free(tclusterid);
|
2895
|
+
|
2896
|
+
free(saved);
|
2897
|
+
free(centroids);
|
2898
|
+
free(errors);
|
2899
|
+
|
2900
|
+
return;
|
2901
|
+
}
|
2902
|
+
|
2903
|
+
/* ******************************************************************** */
|
2904
|
+
|
2905
|
+
double** distancematrix (int nrows, int ncolumns, double** data,
|
2906
|
+
int** mask, double weights[], char dist, int transpose)
|
2907
|
+
/*
|
2908
|
+
Purpose
|
2909
|
+
=======
|
2910
|
+
|
2911
|
+
The distancematrix routine calculates the distance matrix between genes or
|
2912
|
+
microarrays using their measured gene expression data. Several distance measures
|
2913
|
+
can be used. The routine returns a pointer to a ragged array containing the
|
2914
|
+
distances between the genes. As the distance matrix is symmetric, with zeros on
|
2915
|
+
the diagonal, only the lower triangular half of the distance matrix is saved.
|
2916
|
+
The distancematrix routine allocates space for the distance matrix. If the
|
2917
|
+
parameter transpose is set to a nonzero value, the distances between the columns
|
2918
|
+
(microarrays) are calculated, otherwise distances between the rows (genes) are
|
2919
|
+
calculated.
|
2920
|
+
If sufficient space in memory cannot be allocated to store the distance matrix,
|
2921
|
+
the routine returns a NULL pointer, and all memory allocated so far for the
|
2922
|
+
distance matrix is freed.
|
2923
|
+
|
2924
|
+
|
2925
|
+
Arguments
|
2926
|
+
=========
|
2927
|
+
|
2928
|
+
nrows (input) int
|
2929
|
+
The number of rows in the gene expression data matrix (i.e., the number of
|
2930
|
+
genes)
|
2931
|
+
|
2932
|
+
ncolumns (input) int
|
2933
|
+
The number of columns in the gene expression data matrix (i.e., the number of
|
2934
|
+
microarrays)
|
2935
|
+
|
2936
|
+
data (input) double[nrows][ncolumns]
|
2937
|
+
The array containing the gene expression data.
|
2938
|
+
|
2939
|
+
mask (input) int[nrows][ncolumns]
|
2940
|
+
This array shows which data values are missing. If mask[i][j]==0, then
|
2941
|
+
data[i][j] is missing.
|
2942
|
+
|
2943
|
+
weight (input) double[n]
|
2944
|
+
The weights that are used to calculate the distance. The length of this vector
|
2945
|
+
is equal to the number of columns if the distances between genes are calculated,
|
2946
|
+
or the number of rows if the distances between microarrays are calculated.
|
2947
|
+
|
2948
|
+
dist (input) char
|
2949
|
+
Defines which distance measure is used, as given by the table:
|
2950
|
+
dist=='e': Euclidean distance
|
2951
|
+
dist=='b': City-block distance
|
2952
|
+
dist=='c': correlation
|
2953
|
+
dist=='a': absolute value of the correlation
|
2954
|
+
dist=='u': uncentered correlation
|
2955
|
+
dist=='x': absolute uncentered correlation
|
2956
|
+
dist=='s': Spearman's rank correlation
|
2957
|
+
dist=='k': Kendall's tau
|
2958
|
+
For other values of dist, the default (Euclidean distance) is used.
|
2959
|
+
|
2960
|
+
transpose (input) int
|
2961
|
+
If transpose is equal to zero, the distances between the rows is
|
2962
|
+
calculated. Otherwise, the distances between the columns is calculated.
|
2963
|
+
The former is needed when genes are being clustered; the latter is used
|
2964
|
+
when microarrays are being clustered.
|
2965
|
+
|
2966
|
+
========================================================================
|
2967
|
+
*/
|
2968
|
+
{ /* First determine the size of the distance matrix */
|
2969
|
+
const int n = (transpose==0) ? nrows : ncolumns;
|
2970
|
+
const int ndata = (transpose==0) ? ncolumns : nrows;
|
2971
|
+
int i,j;
|
2972
|
+
double** matrix;
|
2973
|
+
|
2974
|
+
/* Set the metric function as indicated by dist */
|
2975
|
+
double (*metric)
|
2976
|
+
(int, double**, double**, int**, int**, const double[], int, int, int) =
|
2977
|
+
setmetric(dist);
|
2978
|
+
|
2979
|
+
if (n < 2) return NULL;
|
2980
|
+
|
2981
|
+
/* Set up the ragged array */
|
2982
|
+
matrix = malloc(n*sizeof(double*));
|
2983
|
+
if(matrix==NULL) return NULL; /* Not enough memory available */
|
2984
|
+
matrix[0] = NULL;
|
2985
|
+
/* The zeroth row has zero columns. We allocate it anyway for convenience.*/
|
2986
|
+
for (i = 1; i < n; i++)
|
2987
|
+
{ matrix[i] = malloc(i*sizeof(double));
|
2988
|
+
if (matrix[i]==NULL) break; /* Not enough memory available */
|
2989
|
+
}
|
2990
|
+
if (i < n) /* break condition encountered */
|
2991
|
+
{ j = i;
|
2992
|
+
for (i = 1; i < j; i++) free(matrix[i]);
|
2993
|
+
return NULL;
|
2994
|
+
}
|
2995
|
+
|
2996
|
+
/* Calculate the distances and save them in the ragged array */
|
2997
|
+
for (i = 1; i < n; i++)
|
2998
|
+
for (j = 0; j < i; j++)
|
2999
|
+
matrix[i][j]=metric(ndata,data,data,mask,mask,weights,i,j,transpose);
|
3000
|
+
|
3001
|
+
return matrix;
|
3002
|
+
}
|
3003
|
+
|
3004
|
+
/* ******************************************************************** */
|
3005
|
+
|
3006
|
+
double* calculate_weights(int nrows, int ncolumns, double** data, int** mask,
|
3007
|
+
double weights[], int transpose, char dist, double cutoff, double exponent)
|
3008
|
+
|
3009
|
+
/*
|
3010
|
+
Purpose
|
3011
|
+
=======
|
3012
|
+
|
3013
|
+
This function calculates the weights using the weighting scheme proposed by
|
3014
|
+
Michael Eisen:
|
3015
|
+
w[i] = 1.0 / sum_{j where d[i][j]<cutoff} (1 - d[i][j]/cutoff)^exponent
|
3016
|
+
where the cutoff and the exponent are specified by the user.
|
3017
|
+
|
3018
|
+
|
3019
|
+
Arguments
|
3020
|
+
=========
|
3021
|
+
|
3022
|
+
nrows (input) int
|
3023
|
+
The number of rows in the gene expression data matrix, equal to the number of
|
3024
|
+
genes.
|
3025
|
+
|
3026
|
+
ncolumns (input) int
|
3027
|
+
The number of columns in the gene expression data matrix, equal to the number of
|
3028
|
+
microarrays.
|
3029
|
+
|
3030
|
+
data (input) double[nrows][ncolumns]
|
3031
|
+
The array containing the gene expression data.
|
3032
|
+
|
3033
|
+
mask (input) int[nrows][ncolumns]
|
3034
|
+
This array shows which data values are missing. If mask[i][j]==0, then
|
3035
|
+
data[i][j] is missing.
|
3036
|
+
|
3037
|
+
weight (input) int[ncolumns] if transpose==0,
|
3038
|
+
int[nrows] if transpose==1
|
3039
|
+
The weights that are used to calculate the distance. The length of this vector
|
3040
|
+
is ncolumns if gene weights are being clustered, and nrows if microarrays
|
3041
|
+
weights are being clustered.
|
3042
|
+
|
3043
|
+
transpose (input) int
|
3044
|
+
If transpose==0, the weights of the rows of the data matrix are calculated.
|
3045
|
+
Otherwise, the weights of the columns of the data matrix are calculated.
|
3046
|
+
|
3047
|
+
dist (input) char
|
3048
|
+
Defines which distance measure is used, as given by the table:
|
3049
|
+
dist=='e': Euclidean distance
|
3050
|
+
dist=='b': City-block distance
|
3051
|
+
dist=='c': correlation
|
3052
|
+
dist=='a': absolute value of the correlation
|
3053
|
+
dist=='u': uncentered correlation
|
3054
|
+
dist=='x': absolute uncentered correlation
|
3055
|
+
dist=='s': Spearman's rank correlation
|
3056
|
+
dist=='k': Kendall's tau
|
3057
|
+
For other values of dist, the default (Euclidean distance) is used.
|
3058
|
+
|
3059
|
+
cutoff (input) double
|
3060
|
+
The cutoff to be used to calculate the weights.
|
3061
|
+
|
3062
|
+
exponent (input) double
|
3063
|
+
The exponent to be used to calculate the weights.
|
3064
|
+
|
3065
|
+
|
3066
|
+
Return value
|
3067
|
+
============
|
3068
|
+
|
3069
|
+
The function returns a pointer to a newly allocated array containing the
|
3070
|
+
calculated weights for the rows (if transpose==0) or columns (if
|
3071
|
+
transpose==1). If not enough memory could be allocated to store the
|
3072
|
+
weights array, the function returns NULL.
|
3073
|
+
|
3074
|
+
========================================================================
|
3075
|
+
*/
|
3076
|
+
{ int i,j;
|
3077
|
+
const int ndata = (transpose==0) ? ncolumns : nrows;
|
3078
|
+
const int nelements = (transpose==0) ? nrows : ncolumns;
|
3079
|
+
|
3080
|
+
/* Set the metric function as indicated by dist */
|
3081
|
+
double (*metric)
|
3082
|
+
(int, double**, double**, int**, int**, const double[], int, int, int) =
|
3083
|
+
setmetric(dist);
|
3084
|
+
|
3085
|
+
double* result = malloc(nelements*sizeof(double));
|
3086
|
+
if (!result) return NULL;
|
3087
|
+
memset(result, 0, nelements*sizeof(double));
|
3088
|
+
|
3089
|
+
for (i = 0; i < nelements; i++)
|
3090
|
+
{ result[i] += 1.0;
|
3091
|
+
for (j = 0; j < i; j++)
|
3092
|
+
{ const double distance = metric(ndata, data, data, mask, mask, weights,
|
3093
|
+
i, j, transpose);
|
3094
|
+
if (distance < cutoff)
|
3095
|
+
{ const double dweight = exp(exponent*log(1-distance/cutoff));
|
3096
|
+
/* pow() causes a crash on AIX */
|
3097
|
+
result[i] += dweight;
|
3098
|
+
result[j] += dweight;
|
3099
|
+
}
|
3100
|
+
}
|
3101
|
+
}
|
3102
|
+
for (i = 0; i < nelements; i++) result[i] = 1.0/result[i];
|
3103
|
+
return result;
|
3104
|
+
}
|
3105
|
+
|
3106
|
+
/* ******************************************************************** */
|
3107
|
+
|
3108
|
+
void cuttree (int nelements, Node* tree, int nclusters, int clusterid[])
|
3109
|
+
|
3110
|
+
/*
|
3111
|
+
Purpose
|
3112
|
+
=======
|
3113
|
+
|
3114
|
+
The cuttree routine takes the output of a hierarchical clustering routine, and
|
3115
|
+
divides the elements in the tree structure into clusters based on the
|
3116
|
+
hierarchical clustering result. The number of clusters is specified by the user.
|
3117
|
+
|
3118
|
+
Arguments
|
3119
|
+
=========
|
3120
|
+
|
3121
|
+
nelements (input) int
|
3122
|
+
The number of elements that were clustered.
|
3123
|
+
|
3124
|
+
tree (input) Node[nelements-1]
|
3125
|
+
The clustering solution. Each node in the array describes one linking event,
|
3126
|
+
with tree[i].left and tree[i].right representig the elements that were joined.
|
3127
|
+
The original elements are numbered 0..nelements-1, nodes are numbered
|
3128
|
+
-1..-(nelements-1).
|
3129
|
+
|
3130
|
+
nclusters (input) int
|
3131
|
+
The number of clusters to be formed.
|
3132
|
+
|
3133
|
+
clusterid (output) int[nelements]
|
3134
|
+
The number of the cluster to which each element was assigned. Space for this
|
3135
|
+
array should be allocated before calling the cuttree routine. If a memory
|
3136
|
+
error occured, all elements in clusterid are set to -1.
|
3137
|
+
|
3138
|
+
========================================================================
|
3139
|
+
*/
|
3140
|
+
{ int i, j, k;
|
3141
|
+
int icluster = 0;
|
3142
|
+
const int n = nelements-nclusters; /* number of nodes to join */
|
3143
|
+
int* nodeid;
|
3144
|
+
for (i = nelements-2; i >= n; i--)
|
3145
|
+
{ k = tree[i].left;
|
3146
|
+
if (k>=0)
|
3147
|
+
{ clusterid[k] = icluster;
|
3148
|
+
icluster++;
|
3149
|
+
}
|
3150
|
+
k = tree[i].right;
|
3151
|
+
if (k>=0)
|
3152
|
+
{ clusterid[k] = icluster;
|
3153
|
+
icluster++;
|
3154
|
+
}
|
3155
|
+
}
|
3156
|
+
nodeid = malloc(n*sizeof(int));
|
3157
|
+
if(!nodeid)
|
3158
|
+
{ for (i = 0; i < nelements; i++) clusterid[i] = -1;
|
3159
|
+
return;
|
3160
|
+
}
|
3161
|
+
for (i = 0; i < n; i++) nodeid[i] = -1;
|
3162
|
+
for (i = n-1; i >= 0; i--)
|
3163
|
+
{ if(nodeid[i]<0)
|
3164
|
+
{ j = icluster;
|
3165
|
+
nodeid[i] = j;
|
3166
|
+
icluster++;
|
3167
|
+
}
|
3168
|
+
else j = nodeid[i];
|
3169
|
+
k = tree[i].left;
|
3170
|
+
if (k<0) nodeid[-k-1] = j; else clusterid[k] = j;
|
3171
|
+
k = tree[i].right;
|
3172
|
+
if (k<0) nodeid[-k-1] = j; else clusterid[k] = j;
|
3173
|
+
}
|
3174
|
+
free(nodeid);
|
3175
|
+
return;
|
3176
|
+
}
|
3177
|
+
|
3178
|
+
/* ******************************************************************** */
|
3179
|
+
|
3180
|
+
static
|
3181
|
+
Node* pclcluster (int nrows, int ncolumns, double** data, int** mask,
|
3182
|
+
double weight[], double** distmatrix, char dist, int transpose)
|
3183
|
+
|
3184
|
+
/*
|
3185
|
+
|
3186
|
+
Purpose
|
3187
|
+
=======
|
3188
|
+
|
3189
|
+
The pclcluster routine performs clustering using pairwise centroid-linking
|
3190
|
+
on a given set of gene expression data, using the distance metric given by dist.
|
3191
|
+
|
3192
|
+
Arguments
|
3193
|
+
=========
|
3194
|
+
|
3195
|
+
nrows (input) int
|
3196
|
+
The number of rows in the gene expression data matrix, equal to the number of
|
3197
|
+
genes.
|
3198
|
+
|
3199
|
+
ncolumns (input) int
|
3200
|
+
The number of columns in the gene expression data matrix, equal to the number of
|
3201
|
+
microarrays.
|
3202
|
+
|
3203
|
+
data (input) double[nrows][ncolumns]
|
3204
|
+
The array containing the gene expression data.
|
3205
|
+
|
3206
|
+
mask (input) int[nrows][ncolumns]
|
3207
|
+
This array shows which data values are missing. If
|
3208
|
+
mask[i][j] == 0, then data[i][j] is missing.
|
3209
|
+
|
3210
|
+
weight (input) double[ncolumns] if transpose==0;
|
3211
|
+
double[nrows] if transpose==1
|
3212
|
+
The weights that are used to calculate the distance. The length of this vector
|
3213
|
+
is ncolumns if genes are being clustered, and nrows if microarrays are being
|
3214
|
+
clustered.
|
3215
|
+
|
3216
|
+
transpose (input) int
|
3217
|
+
If transpose==0, the rows of the matrix are clustered. Otherwise, columns
|
3218
|
+
of the matrix are clustered.
|
3219
|
+
|
3220
|
+
dist (input) char
|
3221
|
+
Defines which distance measure is used, as given by the table:
|
3222
|
+
dist=='e': Euclidean distance
|
3223
|
+
dist=='b': City-block distance
|
3224
|
+
dist=='c': correlation
|
3225
|
+
dist=='a': absolute value of the correlation
|
3226
|
+
dist=='u': uncentered correlation
|
3227
|
+
dist=='x': absolute uncentered correlation
|
3228
|
+
dist=='s': Spearman's rank correlation
|
3229
|
+
dist=='k': Kendall's tau
|
3230
|
+
For other values of dist, the default (Euclidean distance) is used.
|
3231
|
+
|
3232
|
+
distmatrix (input) double**
|
3233
|
+
The distance matrix. This matrix is precalculated by the calling routine
|
3234
|
+
treecluster. The pclcluster routine modifies the contents of distmatrix, but
|
3235
|
+
does not deallocate it.
|
3236
|
+
|
3237
|
+
Return value
|
3238
|
+
============
|
3239
|
+
|
3240
|
+
A pointer to a newly allocated array of Node structs, describing the
|
3241
|
+
hierarchical clustering solution consisting of nelements-1 nodes. Depending on
|
3242
|
+
whether genes (rows) or microarrays (columns) were clustered, nelements is
|
3243
|
+
equal to nrows or ncolumns. See src/cluster.h for a description of the Node
|
3244
|
+
structure.
|
3245
|
+
If a memory error occurs, pclcluster returns NULL.
|
3246
|
+
========================================================================
|
3247
|
+
*/
|
3248
|
+
{ int i, j;
|
3249
|
+
const int nelements = (transpose==0) ? nrows : ncolumns;
|
3250
|
+
int inode;
|
3251
|
+
const int ndata = transpose ? nrows : ncolumns;
|
3252
|
+
const int nnodes = nelements - 1;
|
3253
|
+
|
3254
|
+
/* Set the metric function as indicated by dist */
|
3255
|
+
double (*metric)
|
3256
|
+
(int, double**, double**, int**, int**, const double[], int, int, int) =
|
3257
|
+
setmetric(dist);
|
3258
|
+
|
3259
|
+
Node* result;
|
3260
|
+
double** newdata;
|
3261
|
+
int** newmask;
|
3262
|
+
int* distid = malloc(nelements*sizeof(int));
|
3263
|
+
if(!distid) return NULL;
|
3264
|
+
result = malloc(nnodes*sizeof(Node));
|
3265
|
+
if(!result)
|
3266
|
+
{ free(distid);
|
3267
|
+
return NULL;
|
3268
|
+
}
|
3269
|
+
if(!makedatamask(nelements, ndata, &newdata, &newmask))
|
3270
|
+
{ free(result);
|
3271
|
+
free(distid);
|
3272
|
+
return NULL;
|
3273
|
+
}
|
3274
|
+
|
3275
|
+
for (i = 0; i < nelements; i++) distid[i] = i;
|
3276
|
+
/* To remember which row/column in the distance matrix contains what */
|
3277
|
+
|
3278
|
+
/* Storage for node data */
|
3279
|
+
if (transpose)
|
3280
|
+
{ for (i = 0; i < nelements; i++)
|
3281
|
+
{ for (j = 0; j < ndata; j++)
|
3282
|
+
{ newdata[i][j] = data[j][i];
|
3283
|
+
newmask[i][j] = mask[j][i];
|
3284
|
+
}
|
3285
|
+
}
|
3286
|
+
data = newdata;
|
3287
|
+
mask = newmask;
|
3288
|
+
}
|
3289
|
+
else
|
3290
|
+
{ for (i = 0; i < nelements; i++)
|
3291
|
+
{ memcpy(newdata[i], data[i], ndata*sizeof(double));
|
3292
|
+
memcpy(newmask[i], mask[i], ndata*sizeof(int));
|
3293
|
+
}
|
3294
|
+
data = newdata;
|
3295
|
+
mask = newmask;
|
3296
|
+
}
|
3297
|
+
|
3298
|
+
for (inode = 0; inode < nnodes; inode++)
|
3299
|
+
{ /* Find the pair with the shortest distance */
|
3300
|
+
int is = 1;
|
3301
|
+
int js = 0;
|
3302
|
+
result[inode].distance = find_closest_pair(nelements-inode, distmatrix, &is, &js);
|
3303
|
+
result[inode].left = distid[js];
|
3304
|
+
result[inode].right = distid[is];
|
3305
|
+
|
3306
|
+
/* Make node js the new node */
|
3307
|
+
for (i = 0; i < ndata; i++)
|
3308
|
+
{ data[js][i] = data[js][i]*mask[js][i] + data[is][i]*mask[is][i];
|
3309
|
+
mask[js][i] += mask[is][i];
|
3310
|
+
if (mask[js][i]) data[js][i] /= mask[js][i];
|
3311
|
+
}
|
3312
|
+
free(data[is]);
|
3313
|
+
free(mask[is]);
|
3314
|
+
data[is] = data[nnodes-inode];
|
3315
|
+
mask[is] = mask[nnodes-inode];
|
3316
|
+
|
3317
|
+
/* Fix the distances */
|
3318
|
+
distid[is] = distid[nnodes-inode];
|
3319
|
+
for (i = 0; i < is; i++)
|
3320
|
+
distmatrix[is][i] = distmatrix[nnodes-inode][i];
|
3321
|
+
for (i = is + 1; i < nnodes-inode; i++)
|
3322
|
+
distmatrix[i][is] = distmatrix[nnodes-inode][i];
|
3323
|
+
|
3324
|
+
distid[js] = -inode-1;
|
3325
|
+
for (i = 0; i < js; i++)
|
3326
|
+
distmatrix[js][i] = metric(ndata,data,data,mask,mask,weight,js,i,0);
|
3327
|
+
for (i = js + 1; i < nnodes-inode; i++)
|
3328
|
+
distmatrix[i][js] = metric(ndata,data,data,mask,mask,weight,js,i,0);
|
3329
|
+
}
|
3330
|
+
|
3331
|
+
/* Free temporarily allocated space */
|
3332
|
+
free(data[0]);
|
3333
|
+
free(mask[0]);
|
3334
|
+
free(data);
|
3335
|
+
free(mask);
|
3336
|
+
free(distid);
|
3337
|
+
|
3338
|
+
return result;
|
3339
|
+
}
|
3340
|
+
|
3341
|
+
/* ******************************************************************** */
|
3342
|
+
|
3343
|
+
static
|
3344
|
+
int nodecompare(const void* a, const void* b)
|
3345
|
+
/* Helper function for qsort. */
|
3346
|
+
{ const Node* node1 = (const Node*)a;
|
3347
|
+
const Node* node2 = (const Node*)b;
|
3348
|
+
const double term1 = node1->distance;
|
3349
|
+
const double term2 = node2->distance;
|
3350
|
+
if (term1 < term2) return -1;
|
3351
|
+
if (term1 > term2) return +1;
|
3352
|
+
return 0;
|
3353
|
+
}
|
3354
|
+
|
3355
|
+
/* ---------------------------------------------------------------------- */
|
3356
|
+
|
3357
|
+
static
|
3358
|
+
Node* pslcluster (int nrows, int ncolumns, double** data, int** mask,
|
3359
|
+
double weight[], double** distmatrix, char dist, int transpose)
|
3360
|
+
|
3361
|
+
/*
|
3362
|
+
|
3363
|
+
Purpose
|
3364
|
+
=======
|
3365
|
+
|
3366
|
+
The pslcluster routine performs single-linkage hierarchical clustering, using
|
3367
|
+
either the distance matrix directly, if available, or by calculating the
|
3368
|
+
distances from the data array. This implementation is based on the SLINK
|
3369
|
+
algorithm, described in:
|
3370
|
+
Sibson, R. (1973). SLINK: An optimally efficient algorithm for the single-link
|
3371
|
+
cluster method. The Computer Journal, 16(1): 30-34.
|
3372
|
+
The output of this algorithm is identical to conventional single-linkage
|
3373
|
+
hierarchical clustering, but is much more memory-efficient and faster. Hence,
|
3374
|
+
it can be applied to large data sets, for which the conventional single-
|
3375
|
+
linkage algorithm fails due to lack of memory.
|
3376
|
+
|
3377
|
+
|
3378
|
+
Arguments
|
3379
|
+
=========
|
3380
|
+
|
3381
|
+
nrows (input) int
|
3382
|
+
The number of rows in the gene expression data matrix, equal to the number of
|
3383
|
+
genes.
|
3384
|
+
|
3385
|
+
ncolumns (input) int
|
3386
|
+
The number of columns in the gene expression data matrix, equal to the number of
|
3387
|
+
microarrays.
|
3388
|
+
|
3389
|
+
data (input) double[nrows][ncolumns]
|
3390
|
+
The array containing the gene expression data.
|
3391
|
+
|
3392
|
+
mask (input) int[nrows][ncolumns]
|
3393
|
+
This array shows which data values are missing. If
|
3394
|
+
mask[i][j] == 0, then data[i][j] is missing.
|
3395
|
+
|
3396
|
+
weight (input) double[n]
|
3397
|
+
The weights that are used to calculate the distance. The length of this vector
|
3398
|
+
is ncolumns if genes are being clustered, and nrows if microarrays are being
|
3399
|
+
clustered.
|
3400
|
+
|
3401
|
+
transpose (input) int
|
3402
|
+
If transpose==0, the rows of the matrix are clustered. Otherwise, columns
|
3403
|
+
of the matrix are clustered.
|
3404
|
+
|
3405
|
+
dist (input) char
|
3406
|
+
Defines which distance measure is used, as given by the table:
|
3407
|
+
dist=='e': Euclidean distance
|
3408
|
+
dist=='b': City-block distance
|
3409
|
+
dist=='c': correlation
|
3410
|
+
dist=='a': absolute value of the correlation
|
3411
|
+
dist=='u': uncentered correlation
|
3412
|
+
dist=='x': absolute uncentered correlation
|
3413
|
+
dist=='s': Spearman's rank correlation
|
3414
|
+
dist=='k': Kendall's tau
|
3415
|
+
For other values of dist, the default (Euclidean distance) is used.
|
3416
|
+
|
3417
|
+
distmatrix (input) double**
|
3418
|
+
The distance matrix. If the distance matrix is passed by the calling routine
|
3419
|
+
treecluster, it is used by pslcluster to speed up the clustering calculation.
|
3420
|
+
The pslcluster routine does not modify the contents of distmatrix, and does
|
3421
|
+
not deallocate it. If distmatrix is NULL, the pairwise distances are calculated
|
3422
|
+
by the pslcluster routine from the gene expression data (the data and mask
|
3423
|
+
arrays) and stored in temporary arrays. If distmatrix is passed, the original
|
3424
|
+
gene expression data (specified by the data and mask arguments) are not needed
|
3425
|
+
and are therefore ignored.
|
3426
|
+
|
3427
|
+
|
3428
|
+
Return value
|
3429
|
+
============
|
3430
|
+
|
3431
|
+
A pointer to a newly allocated array of Node structs, describing the
|
3432
|
+
hierarchical clustering solution consisting of nelements-1 nodes. Depending on
|
3433
|
+
whether genes (rows) or microarrays (columns) were clustered, nelements is
|
3434
|
+
equal to nrows or ncolumns. See src/cluster.h for a description of the Node
|
3435
|
+
structure.
|
3436
|
+
If a memory error occurs, pslcluster returns NULL.
|
3437
|
+
|
3438
|
+
========================================================================
|
3439
|
+
*/
|
3440
|
+
{ int i, j, k;
|
3441
|
+
const int nelements = transpose ? ncolumns : nrows;
|
3442
|
+
const int nnodes = nelements - 1;
|
3443
|
+
int* vector;
|
3444
|
+
double* temp;
|
3445
|
+
int* index;
|
3446
|
+
Node* result;
|
3447
|
+
temp = malloc(nnodes*sizeof(double));
|
3448
|
+
if(!temp) return NULL;
|
3449
|
+
index = malloc(nelements*sizeof(int));
|
3450
|
+
if(!index)
|
3451
|
+
{ free(temp);
|
3452
|
+
return NULL;
|
3453
|
+
}
|
3454
|
+
vector = malloc(nnodes*sizeof(int));
|
3455
|
+
if(!vector)
|
3456
|
+
{ free(index);
|
3457
|
+
free(temp);
|
3458
|
+
return NULL;
|
3459
|
+
}
|
3460
|
+
result = malloc(nelements*sizeof(Node));
|
3461
|
+
if(!result)
|
3462
|
+
{ free(vector);
|
3463
|
+
free(index);
|
3464
|
+
free(temp);
|
3465
|
+
return NULL;
|
3466
|
+
}
|
3467
|
+
|
3468
|
+
for (i = 0; i < nnodes; i++) vector[i] = i;
|
3469
|
+
|
3470
|
+
if(distmatrix)
|
3471
|
+
{ for (i = 0; i < nrows; i++)
|
3472
|
+
{ result[i].distance = DBL_MAX;
|
3473
|
+
for (j = 0; j < i; j++) temp[j] = distmatrix[i][j];
|
3474
|
+
for (j = 0; j < i; j++)
|
3475
|
+
{ k = vector[j];
|
3476
|
+
if (result[j].distance >= temp[j])
|
3477
|
+
{ if (result[j].distance < temp[k]) temp[k] = result[j].distance;
|
3478
|
+
result[j].distance = temp[j];
|
3479
|
+
vector[j] = i;
|
3480
|
+
}
|
3481
|
+
else if (temp[j] < temp[k]) temp[k] = temp[j];
|
3482
|
+
}
|
3483
|
+
for (j = 0; j < i; j++)
|
3484
|
+
{
|
3485
|
+
if (result[j].distance >= result[vector[j]].distance) vector[j] = i;
|
3486
|
+
}
|
3487
|
+
}
|
3488
|
+
}
|
3489
|
+
else
|
3490
|
+
{ const int ndata = transpose ? nrows : ncolumns;
|
3491
|
+
/* Set the metric function as indicated by dist */
|
3492
|
+
double (*metric)
|
3493
|
+
(int, double**, double**, int**, int**, const double[], int, int, int) =
|
3494
|
+
setmetric(dist);
|
3495
|
+
|
3496
|
+
for (i = 0; i < nelements; i++)
|
3497
|
+
{ result[i].distance = DBL_MAX;
|
3498
|
+
for (j = 0; j < i; j++) temp[j] =
|
3499
|
+
metric(ndata, data, data, mask, mask, weight, i, j, transpose);
|
3500
|
+
for (j = 0; j < i; j++)
|
3501
|
+
{ k = vector[j];
|
3502
|
+
if (result[j].distance >= temp[j])
|
3503
|
+
{ if (result[j].distance < temp[k]) temp[k] = result[j].distance;
|
3504
|
+
result[j].distance = temp[j];
|
3505
|
+
vector[j] = i;
|
3506
|
+
}
|
3507
|
+
else if (temp[j] < temp[k]) temp[k] = temp[j];
|
3508
|
+
}
|
3509
|
+
for (j = 0; j < i; j++)
|
3510
|
+
if (result[j].distance >= result[vector[j]].distance) vector[j] = i;
|
3511
|
+
}
|
3512
|
+
}
|
3513
|
+
free(temp);
|
3514
|
+
|
3515
|
+
for (i = 0; i < nnodes; i++) result[i].left = i;
|
3516
|
+
qsort(result, nnodes, sizeof(Node), nodecompare);
|
3517
|
+
|
3518
|
+
for (i = 0; i < nelements; i++) index[i] = i;
|
3519
|
+
for (i = 0; i < nnodes; i++)
|
3520
|
+
{ j = result[i].left;
|
3521
|
+
k = vector[j];
|
3522
|
+
result[i].left = index[j];
|
3523
|
+
result[i].right = index[k];
|
3524
|
+
index[k] = -i-1;
|
3525
|
+
}
|
3526
|
+
free(vector);
|
3527
|
+
free(index);
|
3528
|
+
|
3529
|
+
result = realloc(result, nnodes*sizeof(Node));
|
3530
|
+
|
3531
|
+
return result;
|
3532
|
+
}
|
3533
|
+
/* ******************************************************************** */
|
3534
|
+
|
3535
|
+
static Node* pmlcluster (int nelements, double** distmatrix)
|
3536
|
+
/*
|
3537
|
+
|
3538
|
+
Purpose
|
3539
|
+
=======
|
3540
|
+
|
3541
|
+
The pmlcluster routine performs clustering using pairwise maximum- (complete-)
|
3542
|
+
linking on the given distance matrix.
|
3543
|
+
|
3544
|
+
Arguments
|
3545
|
+
=========
|
3546
|
+
|
3547
|
+
nelements (input) int
|
3548
|
+
The number of elements to be clustered.
|
3549
|
+
|
3550
|
+
distmatrix (input) double**
|
3551
|
+
The distance matrix, with nelements rows, each row being filled up to the
|
3552
|
+
diagonal. The elements on the diagonal are not used, as they are assumed to be
|
3553
|
+
zero. The distance matrix will be modified by this routine.
|
3554
|
+
|
3555
|
+
Return value
|
3556
|
+
============
|
3557
|
+
|
3558
|
+
A pointer to a newly allocated array of Node structs, describing the
|
3559
|
+
hierarchical clustering solution consisting of nelements-1 nodes. Depending on
|
3560
|
+
whether genes (rows) or microarrays (columns) were clustered, nelements is
|
3561
|
+
equal to nrows or ncolumns. See src/cluster.h for a description of the Node
|
3562
|
+
structure.
|
3563
|
+
If a memory error occurs, pmlcluster returns NULL.
|
3564
|
+
========================================================================
|
3565
|
+
*/
|
3566
|
+
{ int j;
|
3567
|
+
int n;
|
3568
|
+
int* clusterid;
|
3569
|
+
Node* result;
|
3570
|
+
|
3571
|
+
clusterid = malloc(nelements*sizeof(int));
|
3572
|
+
if(!clusterid) return NULL;
|
3573
|
+
result = malloc((nelements-1)*sizeof(Node));
|
3574
|
+
if (!result)
|
3575
|
+
{ free(clusterid);
|
3576
|
+
return NULL;
|
3577
|
+
}
|
3578
|
+
|
3579
|
+
/* Setup a list specifying to which cluster a gene belongs */
|
3580
|
+
for (j = 0; j < nelements; j++) clusterid[j] = j;
|
3581
|
+
|
3582
|
+
for (n = nelements; n > 1; n--)
|
3583
|
+
{ int is = 1;
|
3584
|
+
int js = 0;
|
3585
|
+
result[nelements-n].distance = find_closest_pair(n, distmatrix, &is, &js);
|
3586
|
+
|
3587
|
+
/* Fix the distances */
|
3588
|
+
for (j = 0; j < js; j++)
|
3589
|
+
distmatrix[js][j] = max(distmatrix[is][j],distmatrix[js][j]);
|
3590
|
+
for (j = js+1; j < is; j++)
|
3591
|
+
distmatrix[j][js] = max(distmatrix[is][j],distmatrix[j][js]);
|
3592
|
+
for (j = is+1; j < n; j++)
|
3593
|
+
distmatrix[j][js] = max(distmatrix[j][is],distmatrix[j][js]);
|
3594
|
+
|
3595
|
+
for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
|
3596
|
+
for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
|
3597
|
+
|
3598
|
+
/* Update clusterids */
|
3599
|
+
result[nelements-n].left = clusterid[is];
|
3600
|
+
result[nelements-n].right = clusterid[js];
|
3601
|
+
clusterid[js] = n-nelements-1;
|
3602
|
+
clusterid[is] = clusterid[n-1];
|
3603
|
+
}
|
3604
|
+
free(clusterid);
|
3605
|
+
|
3606
|
+
return result;
|
3607
|
+
}
|
3608
|
+
|
3609
|
+
/* ******************************************************************* */
|
3610
|
+
|
3611
|
+
static Node* palcluster (int nelements, double** distmatrix)
|
3612
|
+
/*
|
3613
|
+
Purpose
|
3614
|
+
=======
|
3615
|
+
|
3616
|
+
The palcluster routine performs clustering using pairwise average
|
3617
|
+
linking on the given distance matrix.
|
3618
|
+
|
3619
|
+
Arguments
|
3620
|
+
=========
|
3621
|
+
|
3622
|
+
nelements (input) int
|
3623
|
+
The number of elements to be clustered.
|
3624
|
+
|
3625
|
+
distmatrix (input) double**
|
3626
|
+
The distance matrix, with nelements rows, each row being filled up to the
|
3627
|
+
diagonal. The elements on the diagonal are not used, as they are assumed to be
|
3628
|
+
zero. The distance matrix will be modified by this routine.
|
3629
|
+
|
3630
|
+
Return value
|
3631
|
+
============
|
3632
|
+
|
3633
|
+
A pointer to a newly allocated array of Node structs, describing the
|
3634
|
+
hierarchical clustering solution consisting of nelements-1 nodes. Depending on
|
3635
|
+
whether genes (rows) or microarrays (columns) were clustered, nelements is
|
3636
|
+
equal to nrows or ncolumns. See src/cluster.h for a description of the Node
|
3637
|
+
structure.
|
3638
|
+
If a memory error occurs, palcluster returns NULL.
|
3639
|
+
========================================================================
|
3640
|
+
*/
|
3641
|
+
{ int j;
|
3642
|
+
int n;
|
3643
|
+
int* clusterid;
|
3644
|
+
int* number;
|
3645
|
+
Node* result;
|
3646
|
+
|
3647
|
+
clusterid = malloc(nelements*sizeof(int));
|
3648
|
+
if(!clusterid) return NULL;
|
3649
|
+
number = malloc(nelements*sizeof(int));
|
3650
|
+
if(!number)
|
3651
|
+
{ free(clusterid);
|
3652
|
+
return NULL;
|
3653
|
+
}
|
3654
|
+
result = malloc((nelements-1)*sizeof(Node));
|
3655
|
+
if (!result)
|
3656
|
+
{ free(clusterid);
|
3657
|
+
free(number);
|
3658
|
+
return NULL;
|
3659
|
+
}
|
3660
|
+
|
3661
|
+
/* Setup a list specifying to which cluster a gene belongs, and keep track
|
3662
|
+
* of the number of elements in each cluster (needed to calculate the
|
3663
|
+
* average). */
|
3664
|
+
for (j = 0; j < nelements; j++)
|
3665
|
+
{ number[j] = 1;
|
3666
|
+
clusterid[j] = j;
|
3667
|
+
}
|
3668
|
+
|
3669
|
+
for (n = nelements; n > 1; n--)
|
3670
|
+
{ int sum;
|
3671
|
+
int is = 1;
|
3672
|
+
int js = 0;
|
3673
|
+
result[nelements-n].distance = find_closest_pair(n, distmatrix, &is, &js);
|
3674
|
+
|
3675
|
+
/* Save result */
|
3676
|
+
result[nelements-n].left = clusterid[is];
|
3677
|
+
result[nelements-n].right = clusterid[js];
|
3678
|
+
|
3679
|
+
/* Fix the distances */
|
3680
|
+
sum = number[is] + number[js];
|
3681
|
+
for (j = 0; j < js; j++)
|
3682
|
+
{ distmatrix[js][j] = distmatrix[is][j]*number[is]
|
3683
|
+
+ distmatrix[js][j]*number[js];
|
3684
|
+
distmatrix[js][j] /= sum;
|
3685
|
+
}
|
3686
|
+
for (j = js+1; j < is; j++)
|
3687
|
+
{ distmatrix[j][js] = distmatrix[is][j]*number[is]
|
3688
|
+
+ distmatrix[j][js]*number[js];
|
3689
|
+
distmatrix[j][js] /= sum;
|
3690
|
+
}
|
3691
|
+
for (j = is+1; j < n; j++)
|
3692
|
+
{ distmatrix[j][js] = distmatrix[j][is]*number[is]
|
3693
|
+
+ distmatrix[j][js]*number[js];
|
3694
|
+
distmatrix[j][js] /= sum;
|
3695
|
+
}
|
3696
|
+
|
3697
|
+
for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
|
3698
|
+
for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
|
3699
|
+
|
3700
|
+
/* Update number of elements in the clusters */
|
3701
|
+
number[js] = sum;
|
3702
|
+
number[is] = number[n-1];
|
3703
|
+
|
3704
|
+
/* Update clusterids */
|
3705
|
+
clusterid[js] = n-nelements-1;
|
3706
|
+
clusterid[is] = clusterid[n-1];
|
3707
|
+
}
|
3708
|
+
free(clusterid);
|
3709
|
+
free(number);
|
3710
|
+
|
3711
|
+
return result;
|
3712
|
+
}
|
3713
|
+
|
3714
|
+
/* ******************************************************************* */
|
3715
|
+
|
3716
|
+
Node* treecluster (int nrows, int ncolumns, double** data, int** mask,
|
3717
|
+
double weight[], int transpose, char dist, char method, double** distmatrix)
|
3718
|
+
/*
|
3719
|
+
Purpose
|
3720
|
+
=======
|
3721
|
+
|
3722
|
+
The treecluster routine performs hierarchical clustering using pairwise
|
3723
|
+
single-, maximum-, centroid-, or average-linkage, as defined by method, on a
|
3724
|
+
given set of gene expression data, using the distance metric given by dist.
|
3725
|
+
If successful, the function returns a pointer to a newly allocated Tree struct
|
3726
|
+
containing the hierarchical clustering solution, and NULL if a memory error
|
3727
|
+
occurs. The pointer should be freed by the calling routine to prevent memory
|
3728
|
+
leaks.
|
3729
|
+
|
3730
|
+
Arguments
|
3731
|
+
=========
|
3732
|
+
|
3733
|
+
nrows (input) int
|
3734
|
+
The number of rows in the data matrix, equal to the number of genes.
|
3735
|
+
|
3736
|
+
ncolumns (input) int
|
3737
|
+
The number of columns in the data matrix, equal to the number of microarrays.
|
3738
|
+
|
3739
|
+
data (input) double[nrows][ncolumns]
|
3740
|
+
The array containing the data of the vectors to be clustered.
|
3741
|
+
|
3742
|
+
mask (input) int[nrows][ncolumns]
|
3743
|
+
This array shows which data values are missing. If mask[i][j]==0, then
|
3744
|
+
data[i][j] is missing.
|
3745
|
+
|
3746
|
+
weight (input) double array[n]
|
3747
|
+
The weights that are used to calculate the distance.
|
3748
|
+
|
3749
|
+
transpose (input) int
|
3750
|
+
If transpose==0, the rows of the matrix are clustered. Otherwise, columns
|
3751
|
+
of the matrix are clustered.
|
3752
|
+
|
3753
|
+
dist (input) char
|
3754
|
+
Defines which distance measure is used, as given by the table:
|
3755
|
+
dist=='e': Euclidean distance
|
3756
|
+
dist=='b': City-block distance
|
3757
|
+
dist=='c': correlation
|
3758
|
+
dist=='a': absolute value of the correlation
|
3759
|
+
dist=='u': uncentered correlation
|
3760
|
+
dist=='x': absolute uncentered correlation
|
3761
|
+
dist=='s': Spearman's rank correlation
|
3762
|
+
dist=='k': Kendall's tau
|
3763
|
+
For other values of dist, the default (Euclidean distance) is used.
|
3764
|
+
|
3765
|
+
method (input) char
|
3766
|
+
Defines which hierarchical clustering method is used:
|
3767
|
+
method=='s': pairwise single-linkage clustering
|
3768
|
+
method=='m': pairwise maximum- (or complete-) linkage clustering
|
3769
|
+
method=='a': pairwise average-linkage clustering
|
3770
|
+
method=='c': pairwise centroid-linkage clustering
|
3771
|
+
For the first three, either the distance matrix or the gene expression data is
|
3772
|
+
sufficient to perform the clustering algorithm. For pairwise centroid-linkage
|
3773
|
+
clustering, however, the gene expression data are always needed, even if the
|
3774
|
+
distance matrix itself is available.
|
3775
|
+
|
3776
|
+
distmatrix (input) double**
|
3777
|
+
The distance matrix. If the distance matrix is zero initially, the distance
|
3778
|
+
matrix will be allocated and calculated from the data by treecluster, and
|
3779
|
+
deallocated before treecluster returns. If the distance matrix is passed by the
|
3780
|
+
calling routine, treecluster will modify the contents of the distance matrix as
|
3781
|
+
part of the clustering algorithm, but will not deallocate it. The calling
|
3782
|
+
routine should deallocate the distance matrix after the return from treecluster.
|
3783
|
+
|
3784
|
+
Return value
|
3785
|
+
============
|
3786
|
+
|
3787
|
+
A pointer to a newly allocated array of Node structs, describing the
|
3788
|
+
hierarchical clustering solution consisting of nelements-1 nodes. Depending on
|
3789
|
+
whether genes (rows) or microarrays (columns) were clustered, nelements is
|
3790
|
+
equal to nrows or ncolumns. See src/cluster.h for a description of the Node
|
3791
|
+
structure.
|
3792
|
+
If a memory error occurs, treecluster returns NULL.
|
3793
|
+
|
3794
|
+
========================================================================
|
3795
|
+
*/
|
3796
|
+
{ Node* result = NULL;
|
3797
|
+
const int nelements = (transpose==0) ? nrows : ncolumns;
|
3798
|
+
const int ldistmatrix = (distmatrix==NULL && method!='s') ? 1 : 0;
|
3799
|
+
|
3800
|
+
if (nelements < 2) return NULL;
|
3801
|
+
|
3802
|
+
/* Calculate the distance matrix if the user didn't give it */
|
3803
|
+
if(ldistmatrix)
|
3804
|
+
{ distmatrix =
|
3805
|
+
distancematrix(nrows, ncolumns, data, mask, weight, dist, transpose);
|
3806
|
+
if (!distmatrix) return NULL; /* Insufficient memory */
|
3807
|
+
}
|
3808
|
+
|
3809
|
+
switch(method)
|
3810
|
+
{ case 's':
|
3811
|
+
result = pslcluster(nrows, ncolumns, data, mask, weight, distmatrix,
|
3812
|
+
dist, transpose);
|
3813
|
+
break;
|
3814
|
+
case 'm':
|
3815
|
+
result = pmlcluster(nelements, distmatrix);
|
3816
|
+
break;
|
3817
|
+
case 'a':
|
3818
|
+
result = palcluster(nelements, distmatrix);
|
3819
|
+
break;
|
3820
|
+
case 'c':
|
3821
|
+
result = pclcluster(nrows, ncolumns, data, mask, weight, distmatrix,
|
3822
|
+
dist, transpose);
|
3823
|
+
break;
|
3824
|
+
}
|
3825
|
+
|
3826
|
+
/* Deallocate space for distance matrix, if it was allocated by treecluster */
|
3827
|
+
if(ldistmatrix)
|
3828
|
+
{ int i;
|
3829
|
+
for (i = 1; i < nelements; i++) free(distmatrix[i]);
|
3830
|
+
free (distmatrix);
|
3831
|
+
}
|
3832
|
+
|
3833
|
+
return result;
|
3834
|
+
}
|
3835
|
+
|
3836
|
+
/* ******************************************************************* */
|
3837
|
+
|
3838
|
+
static
|
3839
|
+
void somworker (int nrows, int ncolumns, double** data, int** mask,
|
3840
|
+
const double weights[], int transpose, int nxgrid, int nygrid,
|
3841
|
+
double inittau, double*** celldata, int niter, char dist)
|
3842
|
+
|
3843
|
+
{ const int nelements = (transpose==0) ? nrows : ncolumns;
|
3844
|
+
const int ndata = (transpose==0) ? ncolumns : nrows;
|
3845
|
+
int i, j;
|
3846
|
+
double* stddata = calloc(nelements,sizeof(double));
|
3847
|
+
int** dummymask;
|
3848
|
+
int ix, iy;
|
3849
|
+
int* index;
|
3850
|
+
int iter;
|
3851
|
+
/* Maximum radius in which nodes are adjusted */
|
3852
|
+
double maxradius = sqrt(nxgrid*nxgrid+nygrid*nygrid);
|
3853
|
+
|
3854
|
+
/* Set the metric function as indicated by dist */
|
3855
|
+
double (*metric)
|
3856
|
+
(int, double**, double**, int**, int**, const double[], int, int, int) =
|
3857
|
+
setmetric(dist);
|
3858
|
+
|
3859
|
+
/* Calculate the standard deviation for each row or column */
|
3860
|
+
if (transpose==0)
|
3861
|
+
{ for (i = 0; i < nelements; i++)
|
3862
|
+
{ int n = 0;
|
3863
|
+
for (j = 0; j < ndata; j++)
|
3864
|
+
{ if (mask[i][j])
|
3865
|
+
{ double term = data[i][j];
|
3866
|
+
term = term * term;
|
3867
|
+
stddata[i] += term;
|
3868
|
+
n++;
|
3869
|
+
}
|
3870
|
+
}
|
3871
|
+
if (stddata[i] > 0) stddata[i] = sqrt(stddata[i]/n);
|
3872
|
+
else stddata[i] = 1;
|
3873
|
+
}
|
3874
|
+
}
|
3875
|
+
else
|
3876
|
+
{ for (i = 0; i < nelements; i++)
|
3877
|
+
{ int n = 0;
|
3878
|
+
for (j = 0; j < ndata; j++)
|
3879
|
+
{ if (mask[j][i])
|
3880
|
+
{ double term = data[j][i];
|
3881
|
+
term = term * term;
|
3882
|
+
stddata[i] += term;
|
3883
|
+
n++;
|
3884
|
+
}
|
3885
|
+
}
|
3886
|
+
if (stddata[i] > 0) stddata[i] = sqrt(stddata[i]/n);
|
3887
|
+
else stddata[i] = 1;
|
3888
|
+
}
|
3889
|
+
}
|
3890
|
+
|
3891
|
+
if (transpose==0)
|
3892
|
+
{ dummymask = malloc(nygrid*sizeof(int*));
|
3893
|
+
for (i = 0; i < nygrid; i++)
|
3894
|
+
{ dummymask[i] = malloc(ndata*sizeof(int));
|
3895
|
+
for (j = 0; j < ndata; j++) dummymask[i][j] = 1;
|
3896
|
+
}
|
3897
|
+
}
|
3898
|
+
else
|
3899
|
+
{ dummymask = malloc(ndata*sizeof(int*));
|
3900
|
+
for (i = 0; i < ndata; i++)
|
3901
|
+
{ dummymask[i] = malloc(sizeof(int));
|
3902
|
+
dummymask[i][0] = 1;
|
3903
|
+
}
|
3904
|
+
}
|
3905
|
+
|
3906
|
+
/* Randomly initialize the nodes */
|
3907
|
+
for (ix = 0; ix < nxgrid; ix++)
|
3908
|
+
{ for (iy = 0; iy < nygrid; iy++)
|
3909
|
+
{ double sum = 0.;
|
3910
|
+
for (i = 0; i < ndata; i++)
|
3911
|
+
{ double term = -1.0 + 2.0*uniform();
|
3912
|
+
celldata[ix][iy][i] = term;
|
3913
|
+
sum += term * term;
|
3914
|
+
}
|
3915
|
+
sum = sqrt(sum/ndata);
|
3916
|
+
for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum;
|
3917
|
+
}
|
3918
|
+
}
|
3919
|
+
|
3920
|
+
/* Randomize the order in which genes or arrays will be used */
|
3921
|
+
index = malloc(nelements*sizeof(int));
|
3922
|
+
for (i = 0; i < nelements; i++) index[i] = i;
|
3923
|
+
for (i = 0; i < nelements; i++)
|
3924
|
+
{ j = (int) (i + (nelements-i)*uniform());
|
3925
|
+
ix = index[j];
|
3926
|
+
index[j] = index[i];
|
3927
|
+
index[i] = ix;
|
3928
|
+
}
|
3929
|
+
|
3930
|
+
/* Start the iteration */
|
3931
|
+
for (iter = 0; iter < niter; iter++)
|
3932
|
+
{ int ixbest = 0;
|
3933
|
+
int iybest = 0;
|
3934
|
+
int iobject = iter % nelements;
|
3935
|
+
iobject = index[iobject];
|
3936
|
+
if (transpose==0)
|
3937
|
+
{ double closest = metric(ndata,data,celldata[ixbest],
|
3938
|
+
mask,dummymask,weights,iobject,iybest,transpose);
|
3939
|
+
double radius = maxradius * (1. - ((double)iter)/((double)niter));
|
3940
|
+
double tau = inittau * (1. - ((double)iter)/((double)niter));
|
3941
|
+
|
3942
|
+
for (ix = 0; ix < nxgrid; ix++)
|
3943
|
+
{ for (iy = 0; iy < nygrid; iy++)
|
3944
|
+
{ double distance =
|
3945
|
+
metric (ndata,data,celldata[ix],
|
3946
|
+
mask,dummymask,weights,iobject,iy,transpose);
|
3947
|
+
if (distance < closest)
|
3948
|
+
{ ixbest = ix;
|
3949
|
+
iybest = iy;
|
3950
|
+
closest = distance;
|
3951
|
+
}
|
3952
|
+
}
|
3953
|
+
}
|
3954
|
+
for (ix = 0; ix < nxgrid; ix++)
|
3955
|
+
{ for (iy = 0; iy < nygrid; iy++)
|
3956
|
+
{ if (sqrt((ix-ixbest)*(ix-ixbest)+(iy-iybest)*(iy-iybest))<radius)
|
3957
|
+
{ double sum = 0.;
|
3958
|
+
for (i = 0; i < ndata; i++)
|
3959
|
+
{ if (mask[iobject][i]==0) continue;
|
3960
|
+
celldata[ix][iy][i] +=
|
3961
|
+
tau * (data[iobject][i]/stddata[iobject]-celldata[ix][iy][i]);
|
3962
|
+
}
|
3963
|
+
for (i = 0; i < ndata; i++)
|
3964
|
+
{ double term = celldata[ix][iy][i];
|
3965
|
+
term = term * term;
|
3966
|
+
sum += term;
|
3967
|
+
}
|
3968
|
+
if (sum>0)
|
3969
|
+
{ sum = sqrt(sum/ndata);
|
3970
|
+
for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum;
|
3971
|
+
}
|
3972
|
+
}
|
3973
|
+
}
|
3974
|
+
}
|
3975
|
+
}
|
3976
|
+
else
|
3977
|
+
{ double closest;
|
3978
|
+
double** celldatavector = malloc(ndata*sizeof(double*));
|
3979
|
+
double radius = maxradius * (1. - ((double)iter)/((double)niter));
|
3980
|
+
double tau = inittau * (1. - ((double)iter)/((double)niter));
|
3981
|
+
|
3982
|
+
for (i = 0; i < ndata; i++)
|
3983
|
+
celldatavector[i] = &(celldata[ixbest][iybest][i]);
|
3984
|
+
closest = metric(ndata,data,celldatavector,
|
3985
|
+
mask,dummymask,weights,iobject,0,transpose);
|
3986
|
+
for (ix = 0; ix < nxgrid; ix++)
|
3987
|
+
{ for (iy = 0; iy < nygrid; iy++)
|
3988
|
+
{ double distance;
|
3989
|
+
for (i = 0; i < ndata; i++)
|
3990
|
+
celldatavector[i] = &(celldata[ixbest][iybest][i]);
|
3991
|
+
distance =
|
3992
|
+
metric (ndata,data,celldatavector,
|
3993
|
+
mask,dummymask,weights,iobject,0,transpose);
|
3994
|
+
if (distance < closest)
|
3995
|
+
{ ixbest = ix;
|
3996
|
+
iybest = iy;
|
3997
|
+
closest = distance;
|
3998
|
+
}
|
3999
|
+
}
|
4000
|
+
}
|
4001
|
+
free(celldatavector);
|
4002
|
+
for (ix = 0; ix < nxgrid; ix++)
|
4003
|
+
{ for (iy = 0; iy < nygrid; iy++)
|
4004
|
+
{ if (sqrt((ix-ixbest)*(ix-ixbest)+(iy-iybest)*(iy-iybest))<radius)
|
4005
|
+
{ double sum = 0.;
|
4006
|
+
for (i = 0; i < ndata; i++)
|
4007
|
+
{ if (mask[i][iobject]==0) continue;
|
4008
|
+
celldata[ix][iy][i] +=
|
4009
|
+
tau * (data[i][iobject]/stddata[iobject]-celldata[ix][iy][i]);
|
4010
|
+
}
|
4011
|
+
for (i = 0; i < ndata; i++)
|
4012
|
+
{ double term = celldata[ix][iy][i];
|
4013
|
+
term = term * term;
|
4014
|
+
sum += term;
|
4015
|
+
}
|
4016
|
+
if (sum>0)
|
4017
|
+
{ sum = sqrt(sum/ndata);
|
4018
|
+
for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum;
|
4019
|
+
}
|
4020
|
+
}
|
4021
|
+
}
|
4022
|
+
}
|
4023
|
+
}
|
4024
|
+
}
|
4025
|
+
if (transpose==0)
|
4026
|
+
for (i = 0; i < nygrid; i++) free(dummymask[i]);
|
4027
|
+
else
|
4028
|
+
for (i = 0; i < ndata; i++) free(dummymask[i]);
|
4029
|
+
free(dummymask);
|
4030
|
+
free(stddata);
|
4031
|
+
free(index);
|
4032
|
+
return;
|
4033
|
+
}
|
4034
|
+
|
4035
|
+
/* ******************************************************************* */
|
4036
|
+
|
4037
|
+
static
|
4038
|
+
void somassign (int nrows, int ncolumns, double** data, int** mask,
|
4039
|
+
const double weights[], int transpose, int nxgrid, int nygrid,
|
4040
|
+
double*** celldata, char dist, int clusterid[][2])
|
4041
|
+
/* Collect clusterids */
|
4042
|
+
{ const int ndata = (transpose==0) ? ncolumns : nrows;
|
4043
|
+
int i,j;
|
4044
|
+
|
4045
|
+
/* Set the metric function as indicated by dist */
|
4046
|
+
double (*metric)
|
4047
|
+
(int, double**, double**, int**, int**, const double[], int, int, int) =
|
4048
|
+
setmetric(dist);
|
4049
|
+
|
4050
|
+
if (transpose==0)
|
4051
|
+
{ int** dummymask = malloc(nygrid*sizeof(int*));
|
4052
|
+
for (i = 0; i < nygrid; i++)
|
4053
|
+
{ dummymask[i] = malloc(ncolumns*sizeof(int));
|
4054
|
+
for (j = 0; j < ncolumns; j++) dummymask[i][j] = 1;
|
4055
|
+
}
|
4056
|
+
for (i = 0; i < nrows; i++)
|
4057
|
+
{ int ixbest = 0;
|
4058
|
+
int iybest = 0;
|
4059
|
+
double closest = metric(ndata,data,celldata[ixbest],
|
4060
|
+
mask,dummymask,weights,i,iybest,transpose);
|
4061
|
+
int ix, iy;
|
4062
|
+
for (ix = 0; ix < nxgrid; ix++)
|
4063
|
+
{ for (iy = 0; iy < nygrid; iy++)
|
4064
|
+
{ double distance =
|
4065
|
+
metric (ndata,data,celldata[ix],
|
4066
|
+
mask,dummymask,weights,i,iy,transpose);
|
4067
|
+
if (distance < closest)
|
4068
|
+
{ ixbest = ix;
|
4069
|
+
iybest = iy;
|
4070
|
+
closest = distance;
|
4071
|
+
}
|
4072
|
+
}
|
4073
|
+
}
|
4074
|
+
clusterid[i][0] = ixbest;
|
4075
|
+
clusterid[i][1] = iybest;
|
4076
|
+
}
|
4077
|
+
for (i = 0; i < nygrid; i++) free(dummymask[i]);
|
4078
|
+
free(dummymask);
|
4079
|
+
}
|
4080
|
+
else
|
4081
|
+
{ double** celldatavector = malloc(ndata*sizeof(double*));
|
4082
|
+
int** dummymask = malloc(nrows*sizeof(int*));
|
4083
|
+
int ixbest = 0;
|
4084
|
+
int iybest = 0;
|
4085
|
+
for (i = 0; i < nrows; i++)
|
4086
|
+
{ dummymask[i] = malloc(sizeof(int));
|
4087
|
+
dummymask[i][0] = 1;
|
4088
|
+
}
|
4089
|
+
for (i = 0; i < ncolumns; i++)
|
4090
|
+
{ double closest;
|
4091
|
+
int ix, iy;
|
4092
|
+
for (j = 0; j < ndata; j++)
|
4093
|
+
celldatavector[j] = &(celldata[ixbest][iybest][j]);
|
4094
|
+
closest = metric(ndata,data,celldatavector,
|
4095
|
+
mask,dummymask,weights,i,0,transpose);
|
4096
|
+
for (ix = 0; ix < nxgrid; ix++)
|
4097
|
+
{ for (iy = 0; iy < nygrid; iy++)
|
4098
|
+
{ double distance;
|
4099
|
+
for(j = 0; j < ndata; j++)
|
4100
|
+
celldatavector[j] = &(celldata[ix][iy][j]);
|
4101
|
+
distance = metric(ndata,data,celldatavector,
|
4102
|
+
mask,dummymask,weights,i,0,transpose);
|
4103
|
+
if (distance < closest)
|
4104
|
+
{ ixbest = ix;
|
4105
|
+
iybest = iy;
|
4106
|
+
closest = distance;
|
4107
|
+
}
|
4108
|
+
}
|
4109
|
+
}
|
4110
|
+
clusterid[i][0] = ixbest;
|
4111
|
+
clusterid[i][1] = iybest;
|
4112
|
+
}
|
4113
|
+
free(celldatavector);
|
4114
|
+
for (i = 0; i < nrows; i++) free(dummymask[i]);
|
4115
|
+
free(dummymask);
|
4116
|
+
}
|
4117
|
+
return;
|
4118
|
+
}
|
4119
|
+
|
4120
|
+
/* ******************************************************************* */
|
4121
|
+
|
4122
|
+
void somcluster (int nrows, int ncolumns, double** data, int** mask,
|
4123
|
+
const double weight[], int transpose, int nxgrid, int nygrid,
|
4124
|
+
double inittau, int niter, char dist, double*** celldata, int clusterid[][2])
|
4125
|
+
/*
|
4126
|
+
|
4127
|
+
Purpose
|
4128
|
+
=======
|
4129
|
+
|
4130
|
+
The somcluster routine implements a self-organizing map (Kohonen) on a
|
4131
|
+
rectangular grid, using a given set of vectors. The distance measure to be
|
4132
|
+
used to find the similarity between genes and nodes is given by dist.
|
4133
|
+
|
4134
|
+
Arguments
|
4135
|
+
=========
|
4136
|
+
|
4137
|
+
nrows (input) int
|
4138
|
+
The number of rows in the data matrix, equal to the number of genes.
|
4139
|
+
|
4140
|
+
ncolumns (input) int
|
4141
|
+
The number of columns in the data matrix, equal to the number of microarrays.
|
4142
|
+
|
4143
|
+
data (input) double[nrows][ncolumns]
|
4144
|
+
The array containing the gene expression data.
|
4145
|
+
|
4146
|
+
mask (input) int[nrows][ncolumns]
|
4147
|
+
This array shows which data values are missing. If
|
4148
|
+
mask[i][j] == 0, then data[i][j] is missing.
|
4149
|
+
|
4150
|
+
weights (input) double[ncolumns] if transpose==0;
|
4151
|
+
double[nrows] if transpose==1
|
4152
|
+
The weights that are used to calculate the distance. The length of this vector
|
4153
|
+
is ncolumns if genes are being clustered, or nrows if microarrays are being
|
4154
|
+
clustered.
|
4155
|
+
|
4156
|
+
transpose (input) int
|
4157
|
+
If transpose==0, the rows (genes) of the matrix are clustered. Otherwise,
|
4158
|
+
columns (microarrays) of the matrix are clustered.
|
4159
|
+
|
4160
|
+
nxgrid (input) int
|
4161
|
+
The number of grid cells horizontally in the rectangular topology of clusters.
|
4162
|
+
|
4163
|
+
nygrid (input) int
|
4164
|
+
The number of grid cells horizontally in the rectangular topology of clusters.
|
4165
|
+
|
4166
|
+
inittau (input) double
|
4167
|
+
The initial value of tau, representing the neighborhood function.
|
4168
|
+
|
4169
|
+
niter (input) int
|
4170
|
+
The number of iterations to be performed.
|
4171
|
+
|
4172
|
+
dist (input) char
|
4173
|
+
Defines which distance measure is used, as given by the table:
|
4174
|
+
dist=='e': Euclidean distance
|
4175
|
+
dist=='b': City-block distance
|
4176
|
+
dist=='c': correlation
|
4177
|
+
dist=='a': absolute value of the correlation
|
4178
|
+
dist=='u': uncentered correlation
|
4179
|
+
dist=='x': absolute uncentered correlation
|
4180
|
+
dist=='s': Spearman's rank correlation
|
4181
|
+
dist=='k': Kendall's tau
|
4182
|
+
For other values of dist, the default (Euclidean distance) is used.
|
4183
|
+
|
4184
|
+
celldata (output) double[nxgrid][nygrid][ncolumns] if transpose==0;
|
4185
|
+
double[nxgrid][nygrid][nrows] if tranpose==1
|
4186
|
+
The gene expression data for each node (cell) in the 2D grid. This can be
|
4187
|
+
interpreted as the centroid for the cluster corresponding to that cell. If
|
4188
|
+
celldata is NULL, then the centroids are not returned. If celldata is not
|
4189
|
+
NULL, enough space should be allocated to store the centroid data before callingsomcluster.
|
4190
|
+
|
4191
|
+
clusterid (output), int[nrows][2] if transpose==0;
|
4192
|
+
int[ncolumns][2] if transpose==1
|
4193
|
+
For each item (gene or microarray) that is clustered, the coordinates of the
|
4194
|
+
cell in the 2D grid to which the item was assigned. If clusterid is NULL, the
|
4195
|
+
cluster assignments are not returned. If clusterid is not NULL, enough memory
|
4196
|
+
should be allocated to store the clustering information before calling
|
4197
|
+
somcluster.
|
4198
|
+
|
4199
|
+
========================================================================
|
4200
|
+
*/
|
4201
|
+
{ const int nobjects = (transpose==0) ? nrows : ncolumns;
|
4202
|
+
const int ndata = (transpose==0) ? ncolumns : nrows;
|
4203
|
+
int i,j;
|
4204
|
+
const int lcelldata = (celldata==NULL) ? 0 : 1;
|
4205
|
+
|
4206
|
+
if (nobjects < 2) return;
|
4207
|
+
|
4208
|
+
if (lcelldata==0)
|
4209
|
+
{ celldata = malloc(nxgrid*nygrid*ndata*sizeof(double**));
|
4210
|
+
for (i = 0; i < nxgrid; i++)
|
4211
|
+
{ celldata[i] = malloc(nygrid*ndata*sizeof(double*));
|
4212
|
+
for (j = 0; j < nygrid; j++)
|
4213
|
+
celldata[i][j] = malloc(ndata*sizeof(double));
|
4214
|
+
}
|
4215
|
+
}
|
4216
|
+
|
4217
|
+
somworker (nrows, ncolumns, data, mask, weight, transpose, nxgrid, nygrid,
|
4218
|
+
inittau, celldata, niter, dist);
|
4219
|
+
if (clusterid)
|
4220
|
+
somassign (nrows, ncolumns, data, mask, weight, transpose,
|
4221
|
+
nxgrid, nygrid, celldata, dist, clusterid);
|
4222
|
+
if(lcelldata==0)
|
4223
|
+
{ for (i = 0; i < nxgrid; i++)
|
4224
|
+
for (j = 0; j < nygrid; j++)
|
4225
|
+
free(celldata[i][j]);
|
4226
|
+
for (i = 0; i < nxgrid; i++)
|
4227
|
+
free(celldata[i]);
|
4228
|
+
free(celldata);
|
4229
|
+
}
|
4230
|
+
return;
|
4231
|
+
}
|
4232
|
+
|
4233
|
+
/* ******************************************************************** */
|
4234
|
+
|
4235
|
+
double clusterdistance (int nrows, int ncolumns, double** data,
|
4236
|
+
int** mask, double weight[], int n1, int n2, int index1[], int index2[],
|
4237
|
+
char dist, char method, int transpose)
|
4238
|
+
|
4239
|
+
/*
|
4240
|
+
Purpose
|
4241
|
+
=======
|
4242
|
+
|
4243
|
+
The clusterdistance routine calculates the distance between two clusters
|
4244
|
+
containing genes or microarrays using the measured gene expression vectors. The
|
4245
|
+
distance between clusters, given the genes/microarrays in each cluster, can be
|
4246
|
+
defined in several ways. Several distance measures can be used.
|
4247
|
+
|
4248
|
+
The routine returns the distance in double precision.
|
4249
|
+
If the parameter transpose is set to a nonzero value, the clusters are
|
4250
|
+
interpreted as clusters of microarrays, otherwise as clusters of gene.
|
4251
|
+
|
4252
|
+
Arguments
|
4253
|
+
=========
|
4254
|
+
|
4255
|
+
nrows (input) int
|
4256
|
+
The number of rows (i.e., the number of genes) in the gene expression data
|
4257
|
+
matrix.
|
4258
|
+
|
4259
|
+
ncolumns (input) int
|
4260
|
+
The number of columns (i.e., the number of microarrays) in the gene expression
|
4261
|
+
data matrix.
|
4262
|
+
|
4263
|
+
data (input) double[nrows][ncolumns]
|
4264
|
+
The array containing the data of the vectors.
|
4265
|
+
|
4266
|
+
mask (input) int[nrows][ncolumns]
|
4267
|
+
This array shows which data values are missing. If mask[i][j]==0, then
|
4268
|
+
data[i][j] is missing.
|
4269
|
+
|
4270
|
+
weight (input) double[ncolumns] if transpose==0;
|
4271
|
+
double[nrows] if transpose==1
|
4272
|
+
The weights that are used to calculate the distance.
|
4273
|
+
|
4274
|
+
n1 (input) int
|
4275
|
+
The number of elements in the first cluster.
|
4276
|
+
|
4277
|
+
n2 (input) int
|
4278
|
+
The number of elements in the second cluster.
|
4279
|
+
|
4280
|
+
index1 (input) int[n1]
|
4281
|
+
Identifies which genes/microarrays belong to the first cluster.
|
4282
|
+
|
4283
|
+
index2 (input) int[n2]
|
4284
|
+
Identifies which genes/microarrays belong to the second cluster.
|
4285
|
+
|
4286
|
+
dist (input) char
|
4287
|
+
Defines which distance measure is used, as given by the table:
|
4288
|
+
dist=='e': Euclidean distance
|
4289
|
+
dist=='b': City-block distance
|
4290
|
+
dist=='c': correlation
|
4291
|
+
dist=='a': absolute value of the correlation
|
4292
|
+
dist=='u': uncentered correlation
|
4293
|
+
dist=='x': absolute uncentered correlation
|
4294
|
+
dist=='s': Spearman's rank correlation
|
4295
|
+
dist=='k': Kendall's tau
|
4296
|
+
For other values of dist, the default (Euclidean distance) is used.
|
4297
|
+
|
4298
|
+
method (input) char
|
4299
|
+
Defines how the distance between two clusters is defined, given which genes
|
4300
|
+
belong to which cluster:
|
4301
|
+
method=='a': the distance between the arithmetic means of the two clusters
|
4302
|
+
method=='m': the distance between the medians of the two clusters
|
4303
|
+
method=='s': the smallest pairwise distance between members of the two clusters
|
4304
|
+
method=='x': the largest pairwise distance between members of the two clusters
|
4305
|
+
method=='v': average of the pairwise distances between members of the clusters
|
4306
|
+
|
4307
|
+
transpose (input) int
|
4308
|
+
If transpose is equal to zero, the distances between the rows is
|
4309
|
+
calculated. Otherwise, the distances between the columns is calculated.
|
4310
|
+
The former is needed when genes are being clustered; the latter is used
|
4311
|
+
when microarrays are being clustered.
|
4312
|
+
|
4313
|
+
========================================================================
|
4314
|
+
*/
|
4315
|
+
{ /* Set the metric function as indicated by dist */
|
4316
|
+
double (*metric)
|
4317
|
+
(int, double**, double**, int**, int**, const double[], int, int, int) =
|
4318
|
+
setmetric(dist);
|
4319
|
+
|
4320
|
+
/* if one or both clusters are empty, return */
|
4321
|
+
if (n1 < 1 || n2 < 1) return -1.0;
|
4322
|
+
/* Check the indices */
|
4323
|
+
if (transpose==0)
|
4324
|
+
{ int i;
|
4325
|
+
for (i = 0; i < n1; i++)
|
4326
|
+
{ int index = index1[i];
|
4327
|
+
if (index < 0 || index >= nrows) return -1.0;
|
4328
|
+
}
|
4329
|
+
for (i = 0; i < n2; i++)
|
4330
|
+
{ int index = index2[i];
|
4331
|
+
if (index < 0 || index >= nrows) return -1.0;
|
4332
|
+
}
|
4333
|
+
}
|
4334
|
+
else
|
4335
|
+
{ int i;
|
4336
|
+
for (i = 0; i < n1; i++)
|
4337
|
+
{ int index = index1[i];
|
4338
|
+
if (index < 0 || index >= ncolumns) return -1.0;
|
4339
|
+
}
|
4340
|
+
for (i = 0; i < n2; i++)
|
4341
|
+
{ int index = index2[i];
|
4342
|
+
if (index < 0 || index >= ncolumns) return -1.0;
|
4343
|
+
}
|
4344
|
+
}
|
4345
|
+
|
4346
|
+
switch (method)
|
4347
|
+
{ case 'a':
|
4348
|
+
{ /* Find the center */
|
4349
|
+
int i,j,k;
|
4350
|
+
if (transpose==0)
|
4351
|
+
{ double distance;
|
4352
|
+
double* cdata[2];
|
4353
|
+
int* cmask[2];
|
4354
|
+
int* count[2];
|
4355
|
+
count[0] = calloc(ncolumns,sizeof(int));
|
4356
|
+
count[1] = calloc(ncolumns,sizeof(int));
|
4357
|
+
cdata[0] = calloc(ncolumns,sizeof(double));
|
4358
|
+
cdata[1] = calloc(ncolumns,sizeof(double));
|
4359
|
+
cmask[0] = malloc(ncolumns*sizeof(int));
|
4360
|
+
cmask[1] = malloc(ncolumns*sizeof(int));
|
4361
|
+
for (i = 0; i < n1; i++)
|
4362
|
+
{ k = index1[i];
|
4363
|
+
for (j = 0; j < ncolumns; j++)
|
4364
|
+
if (mask[k][j] != 0)
|
4365
|
+
{ cdata[0][j] = cdata[0][j] + data[k][j];
|
4366
|
+
count[0][j] = count[0][j] + 1;
|
4367
|
+
}
|
4368
|
+
}
|
4369
|
+
for (i = 0; i < n2; i++)
|
4370
|
+
{ k = index2[i];
|
4371
|
+
for (j = 0; j < ncolumns; j++)
|
4372
|
+
if (mask[k][j] != 0)
|
4373
|
+
{ cdata[1][j] = cdata[1][j] + data[k][j];
|
4374
|
+
count[1][j] = count[1][j] + 1;
|
4375
|
+
}
|
4376
|
+
}
|
4377
|
+
for (i = 0; i < 2; i++)
|
4378
|
+
for (j = 0; j < ncolumns; j++)
|
4379
|
+
{ if (count[i][j]>0)
|
4380
|
+
{ cdata[i][j] = cdata[i][j] / count[i][j];
|
4381
|
+
cmask[i][j] = 1;
|
4382
|
+
}
|
4383
|
+
else
|
4384
|
+
cmask[i][j] = 0;
|
4385
|
+
}
|
4386
|
+
distance =
|
4387
|
+
metric (ncolumns,cdata,cdata,cmask,cmask,weight,0,1,0);
|
4388
|
+
for (i = 0; i < 2; i++)
|
4389
|
+
{ free (cdata[i]);
|
4390
|
+
free (cmask[i]);
|
4391
|
+
free (count[i]);
|
4392
|
+
}
|
4393
|
+
return distance;
|
4394
|
+
}
|
4395
|
+
else
|
4396
|
+
{ double distance;
|
4397
|
+
int** count = malloc(nrows*sizeof(int*));
|
4398
|
+
double** cdata = malloc(nrows*sizeof(double*));
|
4399
|
+
int** cmask = malloc(nrows*sizeof(int*));
|
4400
|
+
for (i = 0; i < nrows; i++)
|
4401
|
+
{ count[i] = calloc(2,sizeof(int));
|
4402
|
+
cdata[i] = calloc(2,sizeof(double));
|
4403
|
+
cmask[i] = malloc(2*sizeof(int));
|
4404
|
+
}
|
4405
|
+
for (i = 0; i < n1; i++)
|
4406
|
+
{ k = index1[i];
|
4407
|
+
for (j = 0; j < nrows; j++)
|
4408
|
+
{ if (mask[j][k] != 0)
|
4409
|
+
{ cdata[j][0] = cdata[j][0] + data[j][k];
|
4410
|
+
count[j][0] = count[j][0] + 1;
|
4411
|
+
}
|
4412
|
+
}
|
4413
|
+
}
|
4414
|
+
for (i = 0; i < n2; i++)
|
4415
|
+
{ k = index2[i];
|
4416
|
+
for (j = 0; j < nrows; j++)
|
4417
|
+
{ if (mask[j][k] != 0)
|
4418
|
+
{ cdata[j][1] = cdata[j][1] + data[j][k];
|
4419
|
+
count[j][1] = count[j][1] + 1;
|
4420
|
+
}
|
4421
|
+
}
|
4422
|
+
}
|
4423
|
+
for (i = 0; i < nrows; i++)
|
4424
|
+
for (j = 0; j < 2; j++)
|
4425
|
+
if (count[i][j]>0)
|
4426
|
+
{ cdata[i][j] = cdata[i][j] / count[i][j];
|
4427
|
+
cmask[i][j] = 1;
|
4428
|
+
}
|
4429
|
+
else
|
4430
|
+
cmask[i][j] = 0;
|
4431
|
+
distance = metric (nrows,cdata,cdata,cmask,cmask,weight,0,1,1);
|
4432
|
+
for (i = 0; i < nrows; i++)
|
4433
|
+
{ free (count[i]);
|
4434
|
+
free (cdata[i]);
|
4435
|
+
free (cmask[i]);
|
4436
|
+
}
|
4437
|
+
free (count);
|
4438
|
+
free (cdata);
|
4439
|
+
free (cmask);
|
4440
|
+
return distance;
|
4441
|
+
}
|
4442
|
+
}
|
4443
|
+
case 'm':
|
4444
|
+
{ int i, j, k;
|
4445
|
+
if (transpose==0)
|
4446
|
+
{ double distance;
|
4447
|
+
double* temp = malloc(nrows*sizeof(double));
|
4448
|
+
double* cdata[2];
|
4449
|
+
int* cmask[2];
|
4450
|
+
for (i = 0; i < 2; i++)
|
4451
|
+
{ cdata[i] = malloc(ncolumns*sizeof(double));
|
4452
|
+
cmask[i] = malloc(ncolumns*sizeof(int));
|
4453
|
+
}
|
4454
|
+
for (j = 0; j < ncolumns; j++)
|
4455
|
+
{ int count = 0;
|
4456
|
+
for (k = 0; k < n1; k++)
|
4457
|
+
{ i = index1[k];
|
4458
|
+
if (mask[i][j])
|
4459
|
+
{ temp[count] = data[i][j];
|
4460
|
+
count++;
|
4461
|
+
}
|
4462
|
+
}
|
4463
|
+
if (count>0)
|
4464
|
+
{ cdata[0][j] = median (count,temp);
|
4465
|
+
cmask[0][j] = 1;
|
4466
|
+
}
|
4467
|
+
else
|
4468
|
+
{ cdata[0][j] = 0.;
|
4469
|
+
cmask[0][j] = 0;
|
4470
|
+
}
|
4471
|
+
}
|
4472
|
+
for (j = 0; j < ncolumns; j++)
|
4473
|
+
{ int count = 0;
|
4474
|
+
for (k = 0; k < n2; k++)
|
4475
|
+
{ i = index2[k];
|
4476
|
+
if (mask[i][j])
|
4477
|
+
{ temp[count] = data[i][j];
|
4478
|
+
count++;
|
4479
|
+
}
|
4480
|
+
}
|
4481
|
+
if (count>0)
|
4482
|
+
{ cdata[1][j] = median (count,temp);
|
4483
|
+
cmask[1][j] = 1;
|
4484
|
+
}
|
4485
|
+
else
|
4486
|
+
{ cdata[1][j] = 0.;
|
4487
|
+
cmask[1][j] = 0;
|
4488
|
+
}
|
4489
|
+
}
|
4490
|
+
distance = metric (ncolumns,cdata,cdata,cmask,cmask,weight,0,1,0);
|
4491
|
+
for (i = 0; i < 2; i++)
|
4492
|
+
{ free (cdata[i]);
|
4493
|
+
free (cmask[i]);
|
4494
|
+
}
|
4495
|
+
free(temp);
|
4496
|
+
return distance;
|
4497
|
+
}
|
4498
|
+
else
|
4499
|
+
{ double distance;
|
4500
|
+
double* temp = malloc(ncolumns*sizeof(double));
|
4501
|
+
double** cdata = malloc(nrows*sizeof(double*));
|
4502
|
+
int** cmask = malloc(nrows*sizeof(int*));
|
4503
|
+
for (i = 0; i < nrows; i++)
|
4504
|
+
{ cdata[i] = malloc(2*sizeof(double));
|
4505
|
+
cmask[i] = malloc(2*sizeof(int));
|
4506
|
+
}
|
4507
|
+
for (j = 0; j < nrows; j++)
|
4508
|
+
{ int count = 0;
|
4509
|
+
for (k = 0; k < n1; k++)
|
4510
|
+
{ i = index1[k];
|
4511
|
+
if (mask[j][i])
|
4512
|
+
{ temp[count] = data[j][i];
|
4513
|
+
count++;
|
4514
|
+
}
|
4515
|
+
}
|
4516
|
+
if (count>0)
|
4517
|
+
{ cdata[j][0] = median (count,temp);
|
4518
|
+
cmask[j][0] = 1;
|
4519
|
+
}
|
4520
|
+
else
|
4521
|
+
{ cdata[j][0] = 0.;
|
4522
|
+
cmask[j][0] = 0;
|
4523
|
+
}
|
4524
|
+
}
|
4525
|
+
for (j = 0; j < nrows; j++)
|
4526
|
+
{ int count = 0;
|
4527
|
+
for (k = 0; k < n2; k++)
|
4528
|
+
{ i = index2[k];
|
4529
|
+
if (mask[j][i])
|
4530
|
+
{ temp[count] = data[j][i];
|
4531
|
+
count++;
|
4532
|
+
}
|
4533
|
+
}
|
4534
|
+
if (count>0)
|
4535
|
+
{ cdata[j][1] = median (count,temp);
|
4536
|
+
cmask[j][1] = 1;
|
4537
|
+
}
|
4538
|
+
else
|
4539
|
+
{ cdata[j][1] = 0.;
|
4540
|
+
cmask[j][1] = 0;
|
4541
|
+
}
|
4542
|
+
}
|
4543
|
+
distance = metric (nrows,cdata,cdata,cmask,cmask,weight,0,1,1);
|
4544
|
+
for (i = 0; i < nrows; i++)
|
4545
|
+
{ free (cdata[i]);
|
4546
|
+
free (cmask[i]);
|
4547
|
+
}
|
4548
|
+
free(cdata);
|
4549
|
+
free(cmask);
|
4550
|
+
free(temp);
|
4551
|
+
return distance;
|
4552
|
+
}
|
4553
|
+
}
|
4554
|
+
case 's':
|
4555
|
+
{ int i1, i2, j1, j2;
|
4556
|
+
const int n = (transpose==0) ? ncolumns : nrows;
|
4557
|
+
double mindistance = DBL_MAX;
|
4558
|
+
for (i1 = 0; i1 < n1; i1++)
|
4559
|
+
for (i2 = 0; i2 < n2; i2++)
|
4560
|
+
{ double distance;
|
4561
|
+
j1 = index1[i1];
|
4562
|
+
j2 = index2[i2];
|
4563
|
+
distance = metric (n,data,data,mask,mask,weight,j1,j2,transpose);
|
4564
|
+
if (distance < mindistance) mindistance = distance;
|
4565
|
+
}
|
4566
|
+
return mindistance;
|
4567
|
+
}
|
4568
|
+
case 'x':
|
4569
|
+
{ int i1, i2, j1, j2;
|
4570
|
+
const int n = (transpose==0) ? ncolumns : nrows;
|
4571
|
+
double maxdistance = 0;
|
4572
|
+
for (i1 = 0; i1 < n1; i1++)
|
4573
|
+
for (i2 = 0; i2 < n2; i2++)
|
4574
|
+
{ double distance;
|
4575
|
+
j1 = index1[i1];
|
4576
|
+
j2 = index2[i2];
|
4577
|
+
distance = metric (n,data,data,mask,mask,weight,j1,j2,transpose);
|
4578
|
+
if (distance > maxdistance) maxdistance = distance;
|
4579
|
+
}
|
4580
|
+
return maxdistance;
|
4581
|
+
}
|
4582
|
+
case 'v':
|
4583
|
+
{ int i1, i2, j1, j2;
|
4584
|
+
const int n = (transpose==0) ? ncolumns : nrows;
|
4585
|
+
double distance = 0;
|
4586
|
+
for (i1 = 0; i1 < n1; i1++)
|
4587
|
+
for (i2 = 0; i2 < n2; i2++)
|
4588
|
+
{ j1 = index1[i1];
|
4589
|
+
j2 = index2[i2];
|
4590
|
+
distance += metric (n,data,data,mask,mask,weight,j1,j2,transpose);
|
4591
|
+
}
|
4592
|
+
distance /= (n1*n2);
|
4593
|
+
return distance;
|
4594
|
+
}
|
4595
|
+
}
|
4596
|
+
/* Never get here */
|
4597
|
+
return -2.0;
|
4598
|
+
}
|