ruby-em_algorithm 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +6 -0
- data/Gemfile.lock +30 -0
- data/README.md +44 -0
- data/Rakefile +7 -0
- data/example/.ex1.rb.swp +0 -0
- data/example/.ex2.rb.swp +0 -0
- data/example/.ex3-tmp.rb.swp +0 -0
- data/example/.ex3.rb.swp +0 -0
- data/example/data/2dim-gmm-new.txt +1267 -0
- data/example/data/2dim-gmm-simple.txt +676 -0
- data/example/data/2dim-gmm-test.txt +6565 -0
- data/example/data/2dim-gmm-test2.txt +2782 -0
- data/example/data/2dim-gmm-test3.csv +1641 -0
- data/example/data/2dim-gmm-test3.txt +2782 -0
- data/example/data/2dim-gmm-test4.csv +868 -0
- data/example/data/2dim-gmm-test4.txt +4924 -0
- data/example/data/2dim-gmm-without_weight-small.txt +2401 -0
- data/example/data/2dim-gmm-without_weight.txt +18001 -0
- data/example/data/2dim-gmm.txt +1267 -0
- data/example/data/gmm-new.txt +10001 -0
- data/example/data/gmm-simple.txt +676 -0
- data/example/data/gmm.txt +10001 -0
- data/example/data/old-gmm.txt +10000 -0
- data/example/ex1.rb +20 -0
- data/example/ex1.rb~ +20 -0
- data/example/ex2.rb +33 -0
- data/example/ex2.rb~ +33 -0
- data/example/ex3-tmp.rb +23 -0
- data/example/ex3-tmp.rb~ +25 -0
- data/example/ex3.rb +43 -0
- data/example/ex3.rb~ +43 -0
- data/example/tools/.2dim.rb.swp +0 -0
- data/example/tools/2dim.rb +69 -0
- data/example/tools/2dim.rb~ +69 -0
- data/example/tools/boxmuller.rb +28 -0
- data/example/tools/boxmuller.rb~ +28 -0
- data/example/tools/conv_from_yaml.rb +8 -0
- data/example/tools/conv_from_yaml_to_csv.rb +8 -0
- data/example/tools/conv_to_yaml.rb +17 -0
- data/example/tools/ellipsoid.gnuplot +63 -0
- data/example/tools/ellipsoid.gnuplot~ +64 -0
- data/example/tools/histogram.rb +19 -0
- data/example/tools/histogram2d.rb +20 -0
- data/example/tools/histogram2d.rb~ +18 -0
- data/example/tools/kmeans.rb +34 -0
- data/example/tools/mean.rb +19 -0
- data/example/tools/table.data +4618 -0
- data/example/tools/tmp.txt +69632 -0
- data/example/tools/xmeans.R +608 -0
- data/example/tools/xmeans.rb +35 -0
- data/lib/em_algorithm/.base.rb.swp +0 -0
- data/lib/em_algorithm/base.rb +116 -0
- data/lib/em_algorithm/base.rb~ +116 -0
- data/lib/em_algorithm/convergence/.chi_square.rb.swp +0 -0
- data/lib/em_algorithm/convergence/.likelihood.rb.swp +0 -0
- data/lib/em_algorithm/convergence/check_method.rb +4 -0
- data/lib/em_algorithm/convergence/check_method.rb~ +0 -0
- data/lib/em_algorithm/convergence/chi_square.rb +40 -0
- data/lib/em_algorithm/convergence/chi_square.rb~ +40 -0
- data/lib/em_algorithm/convergence/likelihood.rb +35 -0
- data/lib/em_algorithm/convergence/likelihood.rb~ +35 -0
- data/lib/em_algorithm/models/.gaussian.rb.swp +0 -0
- data/lib/em_algorithm/models/.md_gaussian.rb.swp +0 -0
- data/lib/em_algorithm/models/.mixture.rb.swp +0 -0
- data/lib/em_algorithm/models/.model.rb.swp +0 -0
- data/lib/em_algorithm/models/gaussian.rb +47 -0
- data/lib/em_algorithm/models/gaussian.rb~ +47 -0
- data/lib/em_algorithm/models/md_gaussian.rb +67 -0
- data/lib/em_algorithm/models/md_gaussian.rb~ +67 -0
- data/lib/em_algorithm/models/mixture.rb +122 -0
- data/lib/em_algorithm/models/mixture.rb~ +122 -0
- data/lib/em_algorithm/models/model.rb +19 -0
- data/lib/em_algorithm/models/model.rb~ +19 -0
- data/lib/ruby-em_algorithm.rb +3 -0
- data/lib/ruby-em_algorithm/version.rb +3 -0
- data/ruby-em_algorithm.gemspec +21 -0
- data/spec/spec_helper.rb +9 -0
- metadata +178 -0
@@ -0,0 +1,608 @@
|
|
1
|
+
# $Id: xmeans.prog,v 1.23 2012/04/12 11:21:12 tunenori Exp tunenori $
#
# X-MEANS Clustering
#
# Description:
#
#	Perform x-means non-hierarchical clustering on a data matrix.
#	k-means is applied recursively: each cluster is split in two for
#	as long as the split improves the BIC, then (optionally) pairs of
#	clusters are merged back when the merged model has a better BIC.
#
# Usage:
#
#	xmeans(x, ik = 2, iter.max = 10, pr.proc = F,
#		ignore.covar = T, merge.cls = F)
#
# Arguments:
#	x: A numeric matrix of data, or an object that can be coerced to
#	   such a matrix (such as a numeric vector or a data frame with
#	   all numeric columns).
#
#	ik: The initial number of clusters applied to kmeans().
#	   As xmeans calls kmeans recursively, 'ik' should be sufficiently
#	   small.
#
#	iter.max: The maximum number of iterations allowed.
#
#	pr.proc: logical: If 'TRUE' the system outputs the processing status.
#
#	ignore.covar: logical: If 'TRUE', covariances of cluster data are
#	   ignored. For saving time, 'TRUE' is set as the default.
#
#	merge.cls: logical: If 'TRUE', some clusters may be merged into other
#	   clusters after iterative division.
#
# Value:
#
#	An object of class 'xmeans' which is a list with components:
#
#	cluster: A vector of integers indicating the cluster to which each
#	   point is allocated.
#
#	centers: A matrix of cluster centres.
#
#	size: The number of points in each cluster. When 'merge.cls' is TRUE,
#	   some elements may be zero.
#
# References:
#
#	Ishioka, T. (2005): "An Expansion of X-means for Automatically
#	Determining the Optimal Number of Clusters," The Fourth IASTED
#	International Conference on Computational Intelligence (CI 2005),
#	Calgary Canada, July 4-6, pp.91-96.
#	http://www.rd.dnc.ac.jp/%7Etunenori/doc/487-053.pdf
#
#	Ishioka, T. (2000): ``Extended K-means with an Efficient Estimation
#	of the number of Clusters,'' Intelligent Data Engineering and
#	Automated Learning --- IDEAL 2000, Second International Conference,
#	Shatin, N.T., Hong Kong, China, December 2000, proceedings 17--22.
#	(Lecture Notes in Computer Science 1983, Kwong Sak Leung, Lai-Wan
#	Chan, Helen Meng (Eds.), Springer, 17--22, 2000)
#	http://www.rd.dnc.ac.jp/%7Etunenori/doc/xmeans_ideal2000.pdf
#
# Examples:
#
#	xmeans(iris[,-5], merge.cls=T)
#	plot(cmdscale(dist(iris[,-5])), cex=2, pch=as.numeric(iris[,5]))
#
xmeans <- function(x, ik = 2, iter.max = 10, pr.proc = F, ignore.covar = T, merge.cls = F){
  if (ik < 2)
    ik <- 2
  x <- as.matrix(x)
  p <- ncol(x)  # p-dimensional multivariate
  if (ignore.covar){
    q <- 2 * p  # number of parameters; mean and var for each "p"
  }else{
    q <- p * (p+3) / 2  # mean (p) + upper-triangular covariance; integer
  }
  cl<- kmeans(x,ik,iter.max)  # initial coarse partition
  cl.sub <- list()            # collects per-main-cluster split results

  for (i in 1:ik){  # for each initial (main) cluster
    y.ok <- (cl$cluster == i)        # membership mask of the i-th cluster
    yi <- matrix(x[y.ok], ncol=p)    # extract i-th cluster rows
    zi <- yi                         # save the data for graphics
    yi.centers <- cl$centers[i,]
    zi.centers <- yi.centers
    yi.cluster <- cl$cluster[(cl$cluster == i)]
    yi.cluster <- rep(1, length(yi.cluster))
    # sub-cluster numbering should begin from 1

    k1 <- 1       # cluster number currently being split
    k2 <- k1 + 1  # next fresh cluster number
    bic.prior <- NULL   # NULL => split2cls computes prior stats itself
    stack <- list()     # divided but not-yet-processed halves are stacked
    lnL0 <- lnL(yi, yi.centers, ignore.covar)
    yi.lnL <- lnL0$lnL
    yi.detVx <- lnL0$detVx

    # Depth-first recursive 2-way splitting of cluster i, implemented
    # iteratively with an explicit stack.
    repeat{
      # go through at least 1 time;
      # y$subcluster exists afterwards...
      if (pr.proc) cat (paste("k1 =", k1, ", k2 =", k2,"\n"))
      if (nrow(yi) == 1){  # a singleton cannot be split
        break
      }
      # NOTE: lnL.prior / detVx.prior may be unset on the first pass;
      # this is safe because split2cls only evaluates them (lazily)
      # when bic.prior is non-NULL.
      y <- split2cls(yi, yi.centers, q, bic.prior, lnL.prior, detVx.prior, iter.max, ignore.covar)
      if (y$continue){  # splitting accepted: fold the 2-way result in
        yi.cluster <-
          updtCrusterNum(y$continue, yi.cluster, k1, k2, y$subcluster)
        zi.centers <-
          updtCenters(y$continue, zi.centers, k1, k2, y$centers)
        yi.lnL <-
          updtlnL(y$continue, yi.lnL, k1, k2, y$lnL.post)
        yi.detVx <-
          updtdetVx(y$continue, yi.detVx, k1, k2, y$detVx.post)
      }

      if (pr.proc) print(y$subcluster)
      if (pr.proc){ print(y$bic.prior)
        print(y$bic.post)
        # print(y$lnL.prior) # for debug
        # print(y$lnL.post) # for debug
        print(y$continue) }
      # cat("zi.centers=\n") # for debug
      # print(zi.centers) # for debug
      if (!y$continue){  # this branch is a leaf (no further split)
        if ((nstack <- length(stack))){  # there are stacked data
          # pop the most recently stacked half and continue with it
          if (pr.proc)
            cat(paste("extract the stacked data (", nstack, ")...\n"))
          yi <- stack[[nstack]]$data
          yi.centers <- stack[[nstack]]$centers
          bic.prior <- stack[[nstack]]$bic
          lnL.prior <- stack[[nstack]]$lnL
          detVx.prior <- stack[[nstack]]$detVx
          k1 <- stack[[nstack]]$cls
          k2 <- k2  # unchanged
          # delete the popped data set
          if (nstack > 1){
            stack <- stack[1:(nstack-1)]
          }else{
            stack <- list()  # no stacked data left
          }
          next;
        }
        # no node and no stack: cluster i is fully processed
        if (pr.proc) cat ("no node and no stack...\n")
        break;
      }
      # splitting continues...
      y1 <- y$clj1  # data of the two halves
      y2 <- y$clj2
      yi.ctr1 <- y$centers[1,]  # centers
      yi.ctr2 <- y$centers[2,]
      bic.prior1 <- y$bic.post[1]  # bic
      bic.prior2 <- y$bic.post[2]
      lnL.prior1 <- y$lnL.post[1]  # lnL
      lnL.prior2 <- y$lnL.post[2]
      detVx.prior1 <- y$detVx.post[1]  # detVx
      detVx.prior2 <- y$detVx.post[2]

      # one half is processed next (depth-first recursion)...
      yi <- y1
      yi.centers <- yi.ctr1
      bic.prior <- bic.prior1
      lnL.prior <- lnL.prior1
      detVx.prior <- detVx.prior1
      # ...the other half is stacked for later
      if (pr.proc) cat ("stacking ...\n")
      stack <- c(stack,
        list(list(data=y2, centers=yi.ctr2,
          bic=bic.prior2, lnL=lnL.prior2, detVx=detVx.prior2, cls=k2)))
      # increment the fresh cluster number
      k2 <- k2 + 1

    } # end of repeat

    # splitting of main cluster i done ...
    if (pr.proc){
      cat ("splitting done...\n")
      cat (paste("main cluster =",i,"*******\n"))
    }
    cl.sub <- c(cl.sub, list(list(cluster = yi.cluster,
      centers = zi.centers, lnL = yi.lnL, detVx = yi.detVx,
      size = tabulate(yi.cluster))))
    if (pr.proc){
      print(cl.sub[[i]])
      plot(zi, col=yi.cluster)
      if (is.vector(zi.centers))
        points(zi.centers[1], zi.centers[2], pch=8)
      else # array
        points(zi.centers,col=1:(length(zi.centers)/p),pch=8)
    }
  }
  if (pr.proc) print(cl.sub)
  # renumber sub-clusters globally and flatten into one result
  xcl <- mergeResult(cl, cl.sub, ik)

  if (merge.cls == F) {
    return(list(cluster = xcl$cluster, centers = xcl$centers, size = xcl$size))
  }

  # merge after progressive dividing
  #
  if (pr.proc) cat("merging after progressive dividing ...\n")

  k <- length(xcl$size)  # final cluster number
  if (k <= 2){  # minimum cluster number should be 2
    if (pr.proc) cat("merging skipped ...\n")
    return(list(cluster = xcl$cluster, centers = xcl$centers, size = xcl$size))
  }
  if (pr.proc){
    cat("xcl$detVx=")
    print(xcl$detVx)
    cat("xcl$size=")
    print(xcl$size)
  }

  # NOTE(review): the original comment says this orders by xcl$detVx,
  # but the code orders by xcl$size ("small" to "large") — confirm intended.
  klist <- sort.list(xcl$size)
  if (pr.proc) print(klist)
  # consider every pair of clusters, smallest first
  for (i in 1:(k-1)){
    for (j in (i+1):k){
      k1 = klist[i]
      k2 = klist[j]
      if (pr.proc) cat(paste("inspecting the clusters", k1,"and", k2,"\n"))

      z <- mergedBIC(x, xcl, k1, k2, q, ignore.covar, pr.proc)
      if (z$ret == F){
        # k1 or k2 has already been merged away; skip this pair
        if (pr.proc) cat("skipping... k1=", k1, "k2=", k2,"\n")
        next
      }
      if (z$bicdiv > z$bicmgd){
        # we prefer the merged model:
        # fold cluster k2 into k1 and zero out k2's slots
        if (pr.proc) cat("replace cls.", k2, "to", k1,"\n")
        xcl$cluster <- replace(xcl$cluster, (xcl$cluster == k2), k1)
        xcl$size[k1] <- xcl$size[k1] + xcl$size[k2]
        xcl$size[k2] <- 0
        xcl$lnL[k1] <- z$lnLmgd
        xcl$lnL[k2] <- 0
        xcl$detVx[k1] <- z$detVxmgd
        xcl$detVx[k2] <- 0
        xcl$centers[k1,] <- z$ctrmgd
        xcl$centers[k2,] <- 0

      }
    }
  }
  list(cluster = xcl$cluster, centers = xcl$centers, size = xcl$size)
}
|
252
|
+
|
253
|
+
|
254
|
+
|
255
|
+
|
256
|
+
# Merge the results of sub-clustering into one flat clustering.
# Cluster numbers assigned by the first kmeans are renumbered so that
# every sub-cluster gets a globally unique id; the per-sub-cluster
# centers and sizes are simply concatenated.
# cl: the result of the first kmeans
# cl.sub: the list of sub-clustering results (one element per main cluster)
# ik: cluster number passed to the first kmeans
# Value: list(cluster, centers, lnL, detVx, size) with one entry/row per
#   final sub-cluster.
mergeResult <- function(cl, cl.sub, ik){
  cluster <- cl$cluster  # main cluster labels, to be renumbered in place
  centers <- NULL
  size <- NULL
  lnL <- NULL
  detVx <- NULL

  # total number of unique sub-clusters; k is used as a decrementing
  # counter while renumbering below.
  k <- 0
  for (i in 1:ik)
    k <- k + length(cl.sub[[i]]$size)
  kk <- k  # remember the total for the dimnames at the end

  for (i in ik:1){  # loop over main clusters (obtained by kmeans), last first
    xsub <- cl.sub[[i]]$cluster
    iki <- ik -i +1
    # NOTE(review): centers/size/lnL/detVx come from cl.sub[[iki]]
    # (forward order) while the label vector comes from cl.sub[[i]]
    # (reverse order) — the asymmetry looks deliberate but confirm it
    # keeps rows aligned with the renumbered labels.
    centers <- rbind(centers, cl.sub[[iki]]$centers)
    size <- c(size, cl.sub[[iki]]$size)
    lnL <- c(lnL, cl.sub[[iki]]$lnL)
    detVx <- c(detVx, cl.sub[[iki]]$detVx)

    # renumber this main cluster's sub-labels (1..m) to global ids,
    # highest local label first so ids never collide
    for (j in length(cl.sub[[i]]$size):1){  # loop over subclusters
      xsub <- replace(xsub, (xsub == j), k)
      k <- k -1
    }
    cluster <- replace(cluster, (cluster == i), xsub)
  }
  # every global id must have been consumed exactly once
  if (k != 0) stop("mergeResult: assertion failed (k = 0)...")
  dimnames(centers) <- list(1:kk, NULL)
  list(cluster = cluster, centers = centers, lnL = lnL, detVx = detVx, size = size)
}
|
292
|
+
|
293
|
+
|
294
|
+
# Update the per-point cluster numbers using the result of split2cls().
# continue: FALSE means no split happened — labels are returned unchanged.
# v: current cluster-number vector.
# k1: the cluster id that was split; it becomes "k1" and "k2".
# xsub: sub-cluster labels (values 1 or 2) for exactly the points of v
#   whose value is k1.
updtCrusterNum <- function(continue, v, k1, k2, xsub){
  # First call: there is no prior label vector yet, so the sub-cluster
  # labels are the whole answer.
  if (!is.vector(v)) {
    return(xsub)
  }
  # No split: keep the labels as they are.
  if (!continue) {
    return(v)
  }
  if (k1 == k2) {
    stop("updtCrusterNum() : k1 and k2 should differ.")
  }

  # Map local sub-labels to global ids: 2 -> k2 (the new cluster),
  # then 1 -> k1 (unchanged id).  Order matters only if k2 == 1,
  # which cannot happen since k2 > k1 >= 1.
  relabeled <- replace(xsub, xsub == 2, k2)
  relabeled <- replace(relabeled, relabeled == 1, k1)
  # Scatter the relabeled values back into the slots that held k1.
  # invisible() mirrors the original's assignment-as-last-expression.
  invisible(replace(v, v == k1, relabeled))
}
|
322
|
+
|
323
|
+
|
324
|
+
# Update the cluster-center matrix using the result of split2cls().
# continue: FALSE means no split happened — centers returned unchanged.
# org.centers: original centers matrix (a plain vector on the first call).
# divided.centers: 2-row matrix of the centers of the two halves; row 1
#   replaces row k1, row 2 is inserted at row k2.
updtCenters <- function(continue, org.centers, k1, k2, divided.centers){
  # First call: no matrix yet — the divided centers are the whole answer.
  if (!is.matrix(org.centers)) {
    return(divided.centers)
  }
  # No split: centers stay untouched.
  if (!continue) {
    return(org.centers)
  }
  if (k1 == k2) {
    stop("updtCenters() : k1 and k2 should differ.")
  }

  total <- max(k2, nrow(org.centers))
  rows <- lapply(seq_len(total), function(i) {
    if (i == k1) {
      divided.centers[1, ]        # first half replaces the split cluster
    } else if (i == k2) {
      divided.centers[2, ]        # second half takes the fresh id
    } else {
      org.centers[i, ]            # untouched clusters keep their center
    }
  })
  do.call(rbind, rows)
}
|
347
|
+
|
348
|
+
# Update the per-cluster log-likelihood vector using the result of
# split2cls().
# continue: FALSE means no split happened — vector returned unchanged.
# org.lnL: original lnL vector (non-vector on the first call).
# divided.lnL: 2-element vector; element 1 replaces slot k1, element 2
#   is inserted at slot k2.
updtlnL <- function(continue, org.lnL, k1, k2, divided.lnL){
  # First call: no prior vector — the divided values are the answer.
  if (!is.vector(org.lnL)) {
    return(divided.lnL)
  }
  # No split: keep the values as they are.
  if (!continue) {
    return(org.lnL)
  }
  if (k1 == k2) {
    stop("updtlnL() : k1 and k2 should differ.")
  }

  # unlist(lapply(...)) coerces mixed NA/numeric entries exactly like
  # the original c() accumulation did.
  unlist(lapply(seq_len(max(k2, length(org.lnL))), function(i) {
    if (i == k1) {
      divided.lnL[1]
    } else if (i == k2) {
      divided.lnL[2]
    } else {
      org.lnL[i]
    }
  }))
}
|
371
|
+
|
372
|
+
# Update the per-cluster covariance-determinant vector using the result
# of split2cls().
# continue: FALSE means no split happened — vector returned unchanged.
# org.detVx: original detVx vector (non-vector on the first call).
# divided.detVx: 2-element vector; element 1 replaces slot k1, element 2
#   is inserted at slot k2.
updtdetVx <- function(continue, org.detVx, k1, k2, divided.detVx){
  # First call: no prior vector — the divided values are the answer.
  if (!is.vector(org.detVx)) {
    return(divided.detVx)
  }
  # No split: keep the values as they are.
  if (!continue) {
    return(org.detVx)
  }
  if (k1 == k2) {
    stop("updtdetVx() : k1 and k2 should differ.")
  }

  # unlist(lapply(...)) coerces mixed NA/numeric entries exactly like
  # the original c() accumulation did.
  unlist(lapply(seq_len(max(k2, length(org.detVx))), function(i) {
    if (i == k1) {
      divided.detVx[1]
    } else if (i == k2) {
      divided.detVx[2]
    } else {
      org.detVx[i]
    }
  }))
}
|
395
|
+
|
396
|
+
# Split the data into 2 clusters if BIC prefers the split.
# q: number of free parameters of the cluster model
# bic.prior: BIC of the unsplit data; if NULL it is computed here
# lnL.prior: lnL of the unsplit data; if bic.prior is NULL it is computed here
# detVx.prior: detVx of the unsplit data; if bic.prior is NULL it is computed here
#   NOTE: thanks to R's lazy argument evaluation, callers may pass unset
#   variables for lnL.prior / detVx.prior as long as bic.prior is NULL —
#   both are reassigned below before their first use.  Fragile but relied
#   upon by xmeans(); do not force these arguments early.
# Value: list with the split decision ('continue'), the 1/2 sub-cluster
#   labels, prior/posterior BIC, lnL and detVx, the 2-row center matrix,
#   and the two extracted data subsets (clj1, clj2).
split2cls <- function(x, centers, q, bic.prior, lnL.prior, detVx.prior, iter.max, ignore.covar){
  if (is.null(bic.prior)){
    # prior statistics not supplied: compute them from the unsplit data
    pb <- priorBIC(x, centers, q, ignore.covar)
    bic.prior <- pb$bic
    lnL.prior <- pb$lnL
    detVx.prior <- pb$detVx
  }
  bic.post <- postBICs(x, centers, q, iter.max, ignore.covar)

  subcluster <- bic.post$clsub$cluster
  #
  # compare whether we should split:
  # bic.post$bic[3] is the BIC of the two-cluster model
  if (is.na(bic.post$bic[3])){
    # BIC may be NA because of too few data
    continue <- FALSE
  }else if (bic.post$bic[3] < bic.prior){
    # splitting improves BIC ...
    # caller replaces the cluster numbers via updtCrusterNum()
    continue <- TRUE
  }else{
    # not splitting...
    # "subcluster" is still returned (all points keep label k1)
    continue <- FALSE
  }
  # note that "subcluster" gives 1 or 2
  list(continue = continue, subcluster = subcluster,
    bic.prior = bic.prior, bic.post = bic.post$bic,
    lnL.prior = lnL.prior, lnL.post = bic.post$lnL,
    detVx.prior = detVx.prior, detVx.post = bic.post$detVx,
    centers = bic.post$clsub$centers,
    clj1 = bic.post$clj1, clj2 = bic.post$clj2)
}
|
433
|
+
|
434
|
+
|
435
|
+
|
436
|
+
|
437
|
+
# BIC of the data as a single (unsplit) cluster.
# x: data matrix; centers: the cluster center; q: number of free
# parameters; ignore.covar: passed through to lnL().
# Value: list(lnL, detVx, bic).
priorBIC <- function(x, centers, q, ignore.covar){
  # log-likelihood of the unsplit cluster under the normal model
  fit <- lnL(x, centers, ignore.covar)
  # BIC = -2 lnL + q log(n); the AIC variant would use q without log(n)
  list(lnL = fit$lnL,
       detVx = fit$detVx,
       bic = -2 * fit$lnL + q * log(nrow(x)))
}
|
444
|
+
|
445
|
+
|
446
|
+
# BICs after a trial 2-way split.
# Splits x into 2 clusters with kmeans and returns the two per-cluster
# BICs plus the BIC of the combined two-cluster model.
# x: data matrix of the cluster being split; centers: its center (unused
#   here beyond the interface); q: free parameters; iter.max: kmeans
#   iterations; ignore.covar: passed through to lnL().
# Value: list(bic = c(BIC1, BIC2, BIC), lnL, detVx, clsub, clj1, clj2).
postBICs <- function(x, centers, q, iter.max, ignore.covar){
  #
  # split into 2 sub-clusters
  clsub <- kmeans(x, 2, iter.max)
  # vectorized membership masks (the original built these with
  # lapply(vec, "==", k), an O(n) list detour for the same logical vector)
  y.ok1 <- (clsub$cluster == 1)  # 1st sub-cluster or not
  y.ok2 <- (clsub$cluster == 2)  # 2nd sub-cluster or not
  # extract sub data: the length-n row mask recycles over the n*p matrix
  # elements column-wise, so matrix(..., ncol=p) rebuilds the sub-matrix
  p <- ncol(x)
  clj1 <- matrix(x[y.ok1], ncol=p)
  clj2 <- matrix(x[y.ok2], ncol=p)
  # two posterior log-likelihoods
  lnL1 <- lnL(clj1, clsub$centers[1,], ignore.covar)
  lnL2 <- lnL(clj2, clsub$centers[2,], ignore.covar)
  n1 <- nrow(clj1)
  n2 <- nrow(clj2)
  # normalizing factor: distance between the 2 centers scaled by the
  # cluster spreads; dist() is in the stats package.
  # (fixed: the original read clsub$center, relying on $ partial matching)
  if (is.na(lnL1$detVx) || is.na(lnL2$detVx))
    beta <- 0
  else
    beta <- dist(clsub$centers) / (sqrt(lnL1$detVx + lnL2$detVx))
  # as.numeric() strips the "dist" class (consistent with mergedBIC)
  alpha <- 0.5 / as.numeric(pnorm(beta))
  BIC1 <- -2 * lnL1$lnL + q * log(n1)
  BIC2 <- -2 * lnL2$lnL + q * log(n2)
  # BIC1 <- -2 * lnL1$lnL + q # AIC
  # BIC2 <- -2 * lnL2$lnL + q # AIC

  # the two-cluster model BIC is not simply (BIC1 + BIC2):
  # it carries the 2q parameters and the overlap correction via alpha
  BIC <- -2 * lnL1$lnL -2 * lnL2$lnL + 2 * q * log(n1 + n2) - 2 * (n1 + n2) * log(alpha)
  # BIC <- -2 * lnL1$lnL -2 * lnL2$lnL + 2 * q - 2 * (n1 + n2) * log(alpha) # AIC
  list(bic = c(BIC1, BIC2, BIC),
    lnL = c(lnL1$lnL, lnL2$lnL),
    detVx = c(lnL1$detVx, lnL2$detVx),
    clsub = clsub, clj1 = clj1, clj2 = clj2)
}
|
495
|
+
|
496
|
+
|
497
|
+
|
498
|
+
# BICs for the two-merged-clusters model versus the divided model.
# x: full data matrix; xcl: current clustering (cluster / centers / size /
#   lnL / detVx); k1, k2: IDs of the clusters considered for merging;
#   q: free parameters; ignore.covar: passed to lnL(); pr.proc: verbose.
# Value: list(ret = FALSE) when the pair cannot be evaluated (already
#   merged, or lnL missing); otherwise
#   list(ret, ctrmgd, lnLmgd, detVxmgd, bicmgd, bicdiv).
mergedBIC <- function(x, xcl, k1, k2, q, ignore.covar, pr.proc){
  # sample sizes; a zero size means that cluster was already merged away
  n1 <- xcl$size[k1]
  n2 <- xcl$size[k2]
  if (n1 == 0 || n2 == 0){
    # already had been merged
    cat("already had been merged\n")
    return(list(ret = FALSE))
  }
  if (is.null(xcl$lnL[k1]) || is.null(xcl$lnL[k2])){
    # lnL may be null because of few data
    cat("lnL may be null because of few data\n")
    return(list(ret = FALSE))
  }

  # ---- divided (two separate clusters) model ----
  lnL1 <- xcl$lnL[k1]
  lnL2 <- xcl$lnL[k2]
  ctrextrt <- rbind(xcl$centers[k1,], xcl$centers[k2,])
  # normalized distance between the two centers
  beta <- dist(ctrextrt) / (sqrt(xcl$detVx[k1] + xcl$detVx[k2]))
  # (fixed: the original wrote round(beta, digit=2), relying on
  # partial matching of round()'s 'digits' argument)
  if (pr.proc) cat(paste("beta=", round(beta, digits=2), "\n"))

  # if (beta > 10){
  #   # 2 clusters far apart
  #   return(list(ret = FALSE))
  # }

  alpha <- 0.5 / as.numeric(pnorm(beta))
  bicdiv <- -2 * lnL1 -2 * lnL2 + 2 * q * log(n1 + n2) - 2 * (n1 + n2) * log(alpha)
  # bicdiv <- -2 * lnL1 -2 * lnL2 + 2 * q - 2 * (n1 + n2) * log(alpha) # AIC

  # ---- merged model ----
  # extract the member rows of the two clusters (vectorized masks; the
  # original used lapply(vec, "==", k) for the same logical vectors)
  y.ok1 <- (xcl$cluster == k1)  # 1st cluster or not
  y.ok2 <- (xcl$cluster == k2)  # 2nd cluster or not

  # extract sub data: length-n row mask recycles column-wise over x
  p <- ncol(x)
  clj1 <- matrix(x[y.ok1], ncol=p)
  clj2 <- matrix(x[y.ok2], ncol=p)
  xmgd <- rbind(clj1, clj2)

  # merged cluster center = size-weighted mean of the two centers
  ctrmgd <- (n1 * xcl$centers[k1,] + n2 * xcl$centers[k2,]) / (n1 + n2)
  lnLmgd <- lnL(xmgd, ctrmgd, ignore.covar)
  bicmgd <- -2 * lnLmgd$lnL + q * log(nrow(xmgd))  # BIC
  # bicmgd <- -2 * lnLmgd$lnL + q # AIC

  list(ret = TRUE, ctrmgd = ctrmgd, lnLmgd = lnLmgd$lnL,
       detVxmgd = lnLmgd$detVx, bicmgd = bicmgd, bicdiv = bicdiv)
}
|
554
|
+
|
555
|
+
|
556
|
+
|
557
|
+
|
558
|
+
|
559
|
+
# Log-likelihood of x under a p-dimensional multivariate normal
# distribution centered at 'centers', with the sample covariance of x.
# ignore.covar: if TRUE, off-diagonal covariances are treated as zero
#   (diagonal model) — faster, and the default throughout this file.
# Value: list(lnL = log-likelihood, detVx = determinant of the
#   (possibly diagonalized) covariance matrix). Both are NA when the
#   sample size is <= 2 (covariance not estimable).
lnL <- function(x, centers, ignore.covar=TRUE){
  x <- as.matrix(x)
  p <- ncol(x)  # p-dimensional multivariate
  n <- nrow(x)  # sample size
  if (missing(centers))
    stop("centers must be a number or a matrix")
  if (n <= 2)   # too few data points
    return(list(lnL=NA, detVx=NA))
  vx <- var(x)  # variance-covariance matrix
  if (p == 1){  # x is a single column; vx is a 1x1 matrix
    invVx <- 1 / as.vector(vx)
    detVx <- as.vector(vx)
  }else{
    if (ignore.covar){
      invVx <- diag(1/diag(vx))  # inverse matrix, assuming diagonal vx
      detVx <- prod(diag(vx))    # determinant, assuming diagonal vx
    }else{
      invVx <- solve(vx)  # inverse matrix of "vx"
      y <- chol(vx)       # Cholesky decomposition: vx = t(y) %*% y
      # det(vx) = det(t(y)) * det(y) = prod(diag(y))^2
      # (fixed: the original used prod(diag(y)) alone, i.e. sqrt(det))
      detVx <- prod(diag(y))^2
    }
  }
  # constant term of the log-density
  # (fixed: the original hard-coded 1.837877066 for log(2*pi))
  t1 <- -p/2 * log(2 * pi)
  t2 <- -log(detVx) / 2
  xmu <- t(apply(x, 1, "-", centers))  # row-wise deviations from the center
  # s = sum over rows of the quadratic form t(xmu_i) %*% invVx %*% xmu_i
  if (p == 1){
    s <- sum(xmu^2 * invVx)
  }else{
    s <- sum(apply(xmu, 1, txInvVxX, invVx=invVx))
  }
  t3 <- -s / 2
  ll <- (t1 + t2) * n + as.numeric(t3)  # log likelihood
  list(lnL=ll, detVx=detVx)
}
|
603
|
+
|
604
|
+
# Quadratic form t(x) %*% invVx %*% x for a single deviation vector x,
# used by lnL() via apply() over the rows of the deviation matrix.
# Returns a 1x1 matrix, exactly as the explicit %*% chain does.
txInvVxX <- function(x, invVx){
  crossprod(x, invVx %*% x)
}
|