ruby-em_algorithm 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +6 -0
- data/Gemfile.lock +30 -0
- data/README.md +44 -0
- data/Rakefile +7 -0
- data/example/.ex1.rb.swp +0 -0
- data/example/.ex2.rb.swp +0 -0
- data/example/.ex3-tmp.rb.swp +0 -0
- data/example/.ex3.rb.swp +0 -0
- data/example/data/2dim-gmm-new.txt +1267 -0
- data/example/data/2dim-gmm-simple.txt +676 -0
- data/example/data/2dim-gmm-test.txt +6565 -0
- data/example/data/2dim-gmm-test2.txt +2782 -0
- data/example/data/2dim-gmm-test3.csv +1641 -0
- data/example/data/2dim-gmm-test3.txt +2782 -0
- data/example/data/2dim-gmm-test4.csv +868 -0
- data/example/data/2dim-gmm-test4.txt +4924 -0
- data/example/data/2dim-gmm-without_weight-small.txt +2401 -0
- data/example/data/2dim-gmm-without_weight.txt +18001 -0
- data/example/data/2dim-gmm.txt +1267 -0
- data/example/data/gmm-new.txt +10001 -0
- data/example/data/gmm-simple.txt +676 -0
- data/example/data/gmm.txt +10001 -0
- data/example/data/old-gmm.txt +10000 -0
- data/example/ex1.rb +20 -0
- data/example/ex1.rb~ +20 -0
- data/example/ex2.rb +33 -0
- data/example/ex2.rb~ +33 -0
- data/example/ex3-tmp.rb +23 -0
- data/example/ex3-tmp.rb~ +25 -0
- data/example/ex3.rb +43 -0
- data/example/ex3.rb~ +43 -0
- data/example/tools/.2dim.rb.swp +0 -0
- data/example/tools/2dim.rb +69 -0
- data/example/tools/2dim.rb~ +69 -0
- data/example/tools/boxmuller.rb +28 -0
- data/example/tools/boxmuller.rb~ +28 -0
- data/example/tools/conv_from_yaml.rb +8 -0
- data/example/tools/conv_from_yaml_to_csv.rb +8 -0
- data/example/tools/conv_to_yaml.rb +17 -0
- data/example/tools/ellipsoid.gnuplot +63 -0
- data/example/tools/ellipsoid.gnuplot~ +64 -0
- data/example/tools/histogram.rb +19 -0
- data/example/tools/histogram2d.rb +20 -0
- data/example/tools/histogram2d.rb~ +18 -0
- data/example/tools/kmeans.rb +34 -0
- data/example/tools/mean.rb +19 -0
- data/example/tools/table.data +4618 -0
- data/example/tools/tmp.txt +69632 -0
- data/example/tools/xmeans.R +608 -0
- data/example/tools/xmeans.rb +35 -0
- data/lib/em_algorithm/.base.rb.swp +0 -0
- data/lib/em_algorithm/base.rb +116 -0
- data/lib/em_algorithm/base.rb~ +116 -0
- data/lib/em_algorithm/convergence/.chi_square.rb.swp +0 -0
- data/lib/em_algorithm/convergence/.likelihood.rb.swp +0 -0
- data/lib/em_algorithm/convergence/check_method.rb +4 -0
- data/lib/em_algorithm/convergence/check_method.rb~ +0 -0
- data/lib/em_algorithm/convergence/chi_square.rb +40 -0
- data/lib/em_algorithm/convergence/chi_square.rb~ +40 -0
- data/lib/em_algorithm/convergence/likelihood.rb +35 -0
- data/lib/em_algorithm/convergence/likelihood.rb~ +35 -0
- data/lib/em_algorithm/models/.gaussian.rb.swp +0 -0
- data/lib/em_algorithm/models/.md_gaussian.rb.swp +0 -0
- data/lib/em_algorithm/models/.mixture.rb.swp +0 -0
- data/lib/em_algorithm/models/.model.rb.swp +0 -0
- data/lib/em_algorithm/models/gaussian.rb +47 -0
- data/lib/em_algorithm/models/gaussian.rb~ +47 -0
- data/lib/em_algorithm/models/md_gaussian.rb +67 -0
- data/lib/em_algorithm/models/md_gaussian.rb~ +67 -0
- data/lib/em_algorithm/models/mixture.rb +122 -0
- data/lib/em_algorithm/models/mixture.rb~ +122 -0
- data/lib/em_algorithm/models/model.rb +19 -0
- data/lib/em_algorithm/models/model.rb~ +19 -0
- data/lib/ruby-em_algorithm.rb +3 -0
- data/lib/ruby-em_algorithm/version.rb +3 -0
- data/ruby-em_algorithm.gemspec +21 -0
- data/spec/spec_helper.rb +9 -0
- metadata +178 -0
@@ -0,0 +1,608 @@
|
|
1
|
+
# $Id: xmeans.prog,v 1.23 2012/04/12 11:21:12 tunenori Exp tunenori $
|
2
|
+
#
|
3
|
+
# X-MEANS Clustering
|
4
|
+
#
|
5
|
+
# Description:
|
6
|
+
#
|
7
|
+
# Perform x-maens non-hierarchal clustering on a data matrix.
|
8
|
+
#
|
9
|
+
# Usage:
|
10
|
+
#
|
11
|
+
# xmeans(x, ik = 2, iter.max = 10, pr.proc = F,
|
12
|
+
# ignore.covar = T, merge.cls = F)
|
13
|
+
#
|
14
|
+
# Arguments:
|
15
|
+
# x: A numeric matrix of data, or an object that can be coerced to
|
16
|
+
# such a matrix (such as a numeric vector or a data frame with
|
17
|
+
# all numeric columns).
|
18
|
+
#
|
19
|
+
# ik: The initial number of clusters applied to kmeans().
|
20
|
+
# As xmeans calls kmeans recursively, 'ik' should be sufficient
|
21
|
+
# small.
|
22
|
+
#
|
23
|
+
# iter.max: The maximum of iterations allowed.
|
24
|
+
#
|
25
|
+
# pr.proc: logical: If 'TRUE' the system outputs the processing status.
|
26
|
+
#
|
27
|
+
# ignore.covar: logical: If 'TRUE', covariances of cluster data are
|
28
|
+
# ignored. For saving of the time, 'TRUE' is set as the defalut.
|
29
|
+
#
|
30
|
+
# merge.cls: logical: If 'TRUE', some clusters may be merged into another
|
31
|
+
# clusters after iterative division.
|
32
|
+
#
|
33
|
+
# Value:
|
34
|
+
#
|
35
|
+
# An object of class 'xmeans' which is a list with components:
|
36
|
+
#
|
37
|
+
# cluster: A vector of integers indicating the cluster to which each
|
38
|
+
# point is allocated.
|
39
|
+
#
|
40
|
+
# centers: A matrix of cluster centres.
|
41
|
+
#
|
42
|
+
# size: The number of points in each cluster. When 'merge.cls' is TRUE,
|
43
|
+
# some elements may be zero.
|
44
|
+
#
|
45
|
+
# References:
|
46
|
+
#
|
47
|
+
# Ishioka, T. (2005): "An Expansion of X-means for Automatically
|
48
|
+
# Determining the Optimal Number of Clusters," The Fourth IASTED
|
49
|
+
# International Conference on Computational Intelligence (CI 2005),
|
50
|
+
# Calgary Canada, July 4-6, pp.91-96.
|
51
|
+
# http://www.rd.dnc.ac.jp/%7Etunenori/doc/487-053.pdf
|
52
|
+
#
|
53
|
+
# Ishioka, T. (2000): ``Extended K-means with an Efficient Estimation
|
54
|
+
# of the number of Clusters,'' Intelligent Data Engineering and
|
55
|
+
# Automated Learning --- IDEAL 2000, Second International Conference,
|
56
|
+
# Shatin, N.T., Hong Kong, China, December 2000, proceedings 17--22.
|
57
|
+
# (Lecture Notes in Computer Science 1983, Kwong Sak Leung, Lai-Wan
|
58
|
+
# Chan, Helen Meng (Eds.), Springer, 17--22, 2000)
|
59
|
+
# http://www.rd.dnc.ac.jp/%7Etunenori/doc/xmeans_ideal2000.pdf
|
60
|
+
#
|
61
|
+
# Examples:
|
62
|
+
#
|
63
|
+
# xmeans(iris[,-5], merge.cls=T)
|
64
|
+
# plot(cmdscale(dist(iris[,-5])), cex=2, pch=as.numeric(iris[,5]))
|
65
|
+
#
|
66
|
+
|
67
|
+
xmeans <- function(x, ik = 2, iter.max = 10, pr.proc = F, ignore.covar = T, merge.cls = F){
|
68
|
+
if (ik < 2)
|
69
|
+
ik <- 2
|
70
|
+
x <- as.matrix(x)
|
71
|
+
p <- ncol(x) # p-dimensional multivariate
|
72
|
+
if (ignore.covar){
|
73
|
+
q <- 2 * p # number of parameters; mean and var for each "p"
|
74
|
+
}else{
|
75
|
+
q <- p * (p+3) / 2 # integer
|
76
|
+
}
|
77
|
+
cl<- kmeans(x,ik,iter.max)
|
78
|
+
cl.sub <- list()
|
79
|
+
|
80
|
+
for (i in 1:ik){ # for each cluster
|
81
|
+
y.ok <- (cl$cluster == i) # i-th cluster or not
|
82
|
+
yi <- matrix(x[y.ok], ncol=p) # extract i-th cluster
|
83
|
+
zi <- yi # save the data for graphics
|
84
|
+
yi.centers <- cl$centers[i,]
|
85
|
+
zi.centers <- yi.centers
|
86
|
+
yi.cluster <- cl$cluster[(cl$cluster == i)]
|
87
|
+
yi.cluster <- rep(1, length(yi.cluster))
|
88
|
+
# sub-cluster number should begin from 1
|
89
|
+
|
90
|
+
k1 <- 1 # cluster number
|
91
|
+
k2 <- k1 + 1
|
92
|
+
bic.prior <- NULL
|
93
|
+
stack <- list() # divided and unproceeded data are stacked
|
94
|
+
lnL0 <- lnL(yi, yi.centers, ignore.covar)
|
95
|
+
yi.lnL <- lnL0$lnL
|
96
|
+
yi.detVx <- lnL0$detVx
|
97
|
+
|
98
|
+
repeat{
|
99
|
+
|
100
|
+
# go through at least 1 time;
|
101
|
+
# y$subcluster exist...
|
102
|
+
if (pr.proc) cat (paste("k1 =", k1, ", k2 =", k2,"\n"))
|
103
|
+
if (nrow(yi) == 1){ # sample size is 1
|
104
|
+
break
|
105
|
+
}
|
106
|
+
y <- split2cls(yi, yi.centers, q, bic.prior, lnL.prior, detVx.prior, iter.max, ignore.covar)
|
107
|
+
if (y$continue){ # splitting continue
|
108
|
+
yi.cluster <-
|
109
|
+
updtCrusterNum(y$continue, yi.cluster, k1, k2, y$subcluster)
|
110
|
+
zi.centers <-
|
111
|
+
updtCenters(y$continue, zi.centers, k1, k2, y$centers)
|
112
|
+
yi.lnL <-
|
113
|
+
updtlnL(y$continue, yi.lnL, k1, k2, y$lnL.post)
|
114
|
+
yi.detVx <-
|
115
|
+
updtdetVx(y$continue, yi.detVx, k1, k2, y$detVx.post)
|
116
|
+
}
|
117
|
+
|
118
|
+
if (pr.proc) print(y$subcluster)
|
119
|
+
if (pr.proc){ print(y$bic.prior)
|
120
|
+
print(y$bic.post)
|
121
|
+
# print(y$lnL.prior) # for debug
|
122
|
+
# print(y$lnL.post) # for debug
|
123
|
+
print(y$continue) }
|
124
|
+
# cat("zi.centers=\n") # for debug
|
125
|
+
# print(zi.centers) # for debug
|
126
|
+
if (!y$continue){ # no-node
|
127
|
+
if ((nstack <- length(stack))){ # there are stacked data
|
128
|
+
# extract the stacked data
|
129
|
+
if (pr.proc)
|
130
|
+
cat(paste("extract the stacked data (", nstack, ")...\n"))
|
131
|
+
yi <- stack[[nstack]]$data
|
132
|
+
yi.centers <- stack[[nstack]]$centers
|
133
|
+
bic.prior <- stack[[nstack]]$bic
|
134
|
+
lnL.prior <- stack[[nstack]]$lnL
|
135
|
+
detVx.prior <- stack[[nstack]]$detVx
|
136
|
+
k1 <- stack[[nstack]]$cls
|
137
|
+
k2 <- k2 # unchanged
|
138
|
+
# delete the data set
|
139
|
+
if (nstack > 1){
|
140
|
+
stack <- stack[1:(nstack-1)]
|
141
|
+
}else{
|
142
|
+
stack <- list() # no stacked data
|
143
|
+
}
|
144
|
+
next;
|
145
|
+
}
|
146
|
+
# no node and no stack
|
147
|
+
if (pr.proc) cat ("no node and no stack...\n")
|
148
|
+
break;
|
149
|
+
}
|
150
|
+
# splitting continues...
|
151
|
+
y1 <- y$clj1 # data
|
152
|
+
y2 <- y$clj2
|
153
|
+
yi.ctr1 <- y$centers[1,] # centers
|
154
|
+
yi.ctr2 <- y$centers[2,]
|
155
|
+
bic.prior1 <- y$bic.post[1] # bic
|
156
|
+
bic.prior2 <- y$bic.post[2]
|
157
|
+
lnL.prior1 <- y$lnL.post[1] # lnL
|
158
|
+
lnL.prior2 <- y$lnL.post[2]
|
159
|
+
detVx.prior1 <- y$detVx.post[1] # detVx
|
160
|
+
detVx.prior2 <- y$detVx.post[2]
|
161
|
+
|
162
|
+
# one-hand repeats recursively...
|
163
|
+
yi <- y1
|
164
|
+
yi.centers <- yi.ctr1
|
165
|
+
bic.prior <- bic.prior1
|
166
|
+
lnL.prior <- lnL.prior1
|
167
|
+
detVx.prior <- detVx.prior1
|
168
|
+
# other-hand is stacked...
|
169
|
+
if (pr.proc) cat ("stacking ...\n")
|
170
|
+
stack <- c(stack,
|
171
|
+
list(list(data=y2, centers=yi.ctr2,
|
172
|
+
bic=bic.prior2, lnL=lnL.prior2, detVx=detVx.prior2, cls=k2)))
|
173
|
+
# inclement the cluster number
|
174
|
+
k2 <- k2 + 1
|
175
|
+
|
176
|
+
} # end of repeat
|
177
|
+
|
178
|
+
# splitting done ...
|
179
|
+
if (pr.proc){
|
180
|
+
cat ("splitting done...\n")
|
181
|
+
cat (paste("main cluster =",i,"*******\n"))
|
182
|
+
}
|
183
|
+
cl.sub <- c(cl.sub, list(list(cluster = yi.cluster,
|
184
|
+
centers = zi.centers, lnL = yi.lnL, detVx = yi.detVx,
|
185
|
+
size = tabulate(yi.cluster))))
|
186
|
+
if (pr.proc){
|
187
|
+
print(cl.sub[[i]])
|
188
|
+
plot(zi, col=yi.cluster)
|
189
|
+
if (is.vector(zi.centers))
|
190
|
+
points(zi.centers[1], zi.centers[2], pch=8)
|
191
|
+
else # array
|
192
|
+
points(zi.centers,col=1:(length(zi.centers)/p),pch=8)
|
193
|
+
}
|
194
|
+
}
|
195
|
+
if (pr.proc) print(cl.sub)
|
196
|
+
xcl <- mergeResult(cl, cl.sub, ik)
|
197
|
+
|
198
|
+
if (merge.cls == F) {
|
199
|
+
return(list(cluster = xcl$cluster, centers = xcl$centers, size = xcl$size))
|
200
|
+
}
|
201
|
+
|
202
|
+
# merge after progressive dividing
|
203
|
+
#
|
204
|
+
if (pr.proc) cat("merging after progressive dividing ...\n")
|
205
|
+
|
206
|
+
k <- length(xcl$size) # final cluster number
|
207
|
+
if (k <= 2){ # minimum cluster number should be 2
|
208
|
+
if (pr.proc) cat("merging skipped ...\n")
|
209
|
+
return(list(cluster = xcl$cluster, centers = xcl$centers, size = xcl$size))
|
210
|
+
}
|
211
|
+
if (pr.proc){
|
212
|
+
cat("xcl$detVx=")
|
213
|
+
print(xcl$detVx)
|
214
|
+
cat("xcl$size=")
|
215
|
+
print(xcl$size)
|
216
|
+
}
|
217
|
+
|
218
|
+
klist <- sort.list(xcl$size) # "small" to "large" order of xcl$detVx list
|
219
|
+
if (pr.proc) print(klist)
|
220
|
+
for (i in 1:(k-1)){
|
221
|
+
for (j in (i+1):k){
|
222
|
+
k1 = klist[i]
|
223
|
+
k2 = klist[j]
|
224
|
+
if (pr.proc) cat(paste("inspecting the clusters", k1,"and", k2,"\n"))
|
225
|
+
|
226
|
+
z <- mergedBIC(x, xcl, k1, k2, q, ignore.covar, pr.proc)
|
227
|
+
if (z$ret == F){
|
228
|
+
# k1 or k2 has been merged.
|
229
|
+
# skip this roop
|
230
|
+
if (pr.proc) cat("skipping... k1=", k1, "k2=", k2,"\n")
|
231
|
+
next
|
232
|
+
}
|
233
|
+
if (z$bicdiv > z$bicmgd){
|
234
|
+
# we prefer merged model.
|
235
|
+
# replace larger cls. number to smaller cls. number
|
236
|
+
if (pr.proc) cat("replace cls.", k2, "to", k1,"\n")
|
237
|
+
xcl$cluster <- replace(xcl$cluster, (xcl$cluster == k2), k1)
|
238
|
+
xcl$size[k1] <- xcl$size[k1] + xcl$size[k2]
|
239
|
+
xcl$size[k2] <- 0
|
240
|
+
xcl$lnL[k1] <- z$lnLmgd
|
241
|
+
xcl$lnL[k2] <- 0
|
242
|
+
xcl$detVx[k1] <- z$detVxmgd
|
243
|
+
xcl$detVx[k2] <- 0
|
244
|
+
xcl$centers[k1,] <- z$ctrmgd
|
245
|
+
xcl$centers[k2,] <- 0
|
246
|
+
|
247
|
+
}
|
248
|
+
}
|
249
|
+
}
|
250
|
+
list(cluster = xcl$cluster, centers = xcl$centers, size = xcl$size)
|
251
|
+
}
|
252
|
+
|
253
|
+
|
254
|
+
|
255
|
+
|
256
|
+
# marge the result of sub-clustering;
|
257
|
+
# cluster numbers by first kmeans should be renumbered;
|
258
|
+
# the other centers and sizes are simply added.
|
259
|
+
# cl: the result of first kmeans
|
260
|
+
# cl.sub: the result of subclustering
|
261
|
+
# ik: cluster number adopted to kmeans.
|
262
|
+
mergeResult <- function(cl, cl.sub, ik){
|
263
|
+
cluster <- cl$cluster # main cluster
|
264
|
+
centers <- NULL
|
265
|
+
size <- NULL
|
266
|
+
lnL <- NULL
|
267
|
+
detVx <- NULL
|
268
|
+
|
269
|
+
k <- 0 # uniq cluster numbers; k should be decremental.
|
270
|
+
for (i in 1:ik)
|
271
|
+
k <- k + length(cl.sub[[i]]$size)
|
272
|
+
kk <- k
|
273
|
+
|
274
|
+
for (i in ik:1){ # loop for main clusters obtained by kmeans
|
275
|
+
xsub <- cl.sub[[i]]$cluster
|
276
|
+
iki <- ik -i +1
|
277
|
+
centers <- rbind(centers, cl.sub[[iki]]$centers)
|
278
|
+
size <- c(size, cl.sub[[iki]]$size)
|
279
|
+
lnL <- c(lnL, cl.sub[[iki]]$lnL)
|
280
|
+
detVx <- c(detVx, cl.sub[[iki]]$detVx)
|
281
|
+
|
282
|
+
for (j in length(cl.sub[[i]]$size):1){ # loop for subclusters
|
283
|
+
xsub <- replace(xsub, (xsub == j), k)
|
284
|
+
k <- k -1
|
285
|
+
}
|
286
|
+
cluster <- replace(cluster, (cluster == i), xsub)
|
287
|
+
}
|
288
|
+
if (k != 0) stop("mergeResult: assertion failed (k = 0)...")
|
289
|
+
dimnames(centers) <- list(1:kk, NULL)
|
290
|
+
list(cluster = cluster, centers = centers, lnL = lnL, detVx = detVx, size = size)
|
291
|
+
}
|
292
|
+
|
293
|
+
|
294
|
+
# update the cluster number by using the result of "split2cls()"
|
295
|
+
# continue: no splitting
|
296
|
+
# v: cluster numbers vector for initial cluster.
|
297
|
+
# k1: cluster numbers should be updated; "k1" becomes "k1" and "k2"
|
298
|
+
# xsub: sub-cluster numbers vector of "v" whose value is "k1";
|
299
|
+
# given "xsub" have 1 or 2.
|
300
|
+
updtCrusterNum <- function(continue, v, k1, k2, xsub){
|
301
|
+
if (!is.vector(v))
|
302
|
+
return(xsub)
|
303
|
+
if (!continue)
|
304
|
+
return(v)
|
305
|
+
if (k1 == k2)
|
306
|
+
stop("updtCrusterNum() : k1 and k2 should differ.")
|
307
|
+
|
308
|
+
# below is same algorithm; explicit array operation is slow in R.
|
309
|
+
# j <- 1
|
310
|
+
# for (i in 1:length(v)){
|
311
|
+
# if (v[i] == k1){
|
312
|
+
# if (xsub[j] == 2)
|
313
|
+
# v[i] <- k2
|
314
|
+
# j <- j + 1
|
315
|
+
# }
|
316
|
+
# }
|
317
|
+
# end of algorithm
|
318
|
+
xsub <- replace(xsub, (xsub == 2), k2) # changed
|
319
|
+
xsub <- replace(xsub, (xsub == 1), k1) # unchanged
|
320
|
+
v <- replace(v, (v == k1), xsub)
|
321
|
+
}
|
322
|
+
|
323
|
+
|
324
|
+
# update the cluster centers by using the result of "split2cls()"
|
325
|
+
# continue: no update
|
326
|
+
# org.centers: original centers matrix
|
327
|
+
# divided.centers: divided centers matrix; it has 2 rows.
|
328
|
+
updtCenters <- function(continue, org.centers, k1, k2, divided.centers){
|
329
|
+
if (!is.matrix(org.centers))
|
330
|
+
return(divided.centers)
|
331
|
+
if (!continue)
|
332
|
+
return(org.centers)
|
333
|
+
if (k1 == k2)
|
334
|
+
stop("updtCenters() : k1 and k2 should differ.")
|
335
|
+
|
336
|
+
z <- NULL
|
337
|
+
for (i in 1:max(k2, nrow(org.centers))){
|
338
|
+
if (i == k1)
|
339
|
+
z <- rbind(z, divided.centers[1,])
|
340
|
+
else if (i == k2)
|
341
|
+
z <- rbind(z, divided.centers[2,])
|
342
|
+
else
|
343
|
+
z <- rbind(z, org.centers[i,])
|
344
|
+
}
|
345
|
+
z
|
346
|
+
}
|
347
|
+
|
348
|
+
# update the lnL by using the result of "split2cls()"
|
349
|
+
# continue: no update
|
350
|
+
# org.lnL: original lnL vector
|
351
|
+
# divided.lnL: divided lnL vector having 2 elements.
|
352
|
+
updtlnL <- function(continue, org.lnL, k1, k2, divided.lnL){
|
353
|
+
if (!is.vector(org.lnL))
|
354
|
+
return(divided.lnL)
|
355
|
+
if (!continue)
|
356
|
+
return(org.lnL)
|
357
|
+
if (k1 == k2)
|
358
|
+
stop("updtlnL() : k1 and k2 should differ.")
|
359
|
+
|
360
|
+
z <- NULL
|
361
|
+
for (i in 1:max(k2, length(org.lnL))){
|
362
|
+
if (i == k1)
|
363
|
+
z <- c(z, divided.lnL[1])
|
364
|
+
else if (i == k2)
|
365
|
+
z <- c(z, divided.lnL[2])
|
366
|
+
else
|
367
|
+
z <- c(z, org.lnL[i])
|
368
|
+
}
|
369
|
+
z
|
370
|
+
}
|
371
|
+
|
372
|
+
# update the detVx by using the result of "split2cls()"
|
373
|
+
# continue: no update
|
374
|
+
# org.detVx: original detVx vector
|
375
|
+
# divided.detVx: divided detVx vector having 2 elements.
|
376
|
+
updtdetVx <- function(continue, org.detVx, k1, k2, divided.detVx){
|
377
|
+
if (!is.vector(org.detVx))
|
378
|
+
return(divided.detVx)
|
379
|
+
if (!continue)
|
380
|
+
return(org.detVx)
|
381
|
+
if (k1 == k2)
|
382
|
+
stop("updtdetVx() : k1 and k2 should differ.")
|
383
|
+
|
384
|
+
z <- NULL
|
385
|
+
for (i in 1:max(k2, length(org.detVx))){
|
386
|
+
if (i == k1)
|
387
|
+
z <- c(z, divided.detVx[1])
|
388
|
+
else if (i == k2)
|
389
|
+
z <- c(z, divided.detVx[2])
|
390
|
+
else
|
391
|
+
z <- c(z, org.detVx[i])
|
392
|
+
}
|
393
|
+
z
|
394
|
+
}
|
395
|
+
|
396
|
+
# split 2 clusters if we would prefer it based on BIC
|
397
|
+
# q: a number of parameters
|
398
|
+
# bic.prior: BIC which x is given; if bic.prior=NULL then we calculate
|
399
|
+
# lnL.prior: lnL which x is given; if bic.prior=NULL then we calculate
|
400
|
+
# detVx.prior: detVx which x is given; if bic.prior=NULL then we calculate
|
401
|
+
split2cls <- function(x, centers, q, bic.prior, lnL.prior, detVx.prior, iter.max, ignore.covar){
|
402
|
+
if (is.null(bic.prior)){
|
403
|
+
pb <- priorBIC(x, centers, q, ignore.covar)
|
404
|
+
bic.prior <- pb$bic
|
405
|
+
lnL.prior <- pb$lnL
|
406
|
+
detVx.prior <- pb$detVx
|
407
|
+
}
|
408
|
+
bic.post <- postBICs(x, centers, q, iter.max, ignore.covar)
|
409
|
+
|
410
|
+
subcluster <- bic.post$clsub$cluster
|
411
|
+
#
|
412
|
+
# compare whether if we should split
|
413
|
+
if (is.na(bic.post$bic[3])){
|
414
|
+
# BIC may has NA because of few data
|
415
|
+
continue <- FALSE
|
416
|
+
}else if (bic.post$bic[3] < bic.prior){
|
417
|
+
# splitting ...
|
418
|
+
# replace the cluster number to cl$cluster
|
419
|
+
continue <- TRUE
|
420
|
+
}else{
|
421
|
+
# not splitting...
|
422
|
+
# return "subcluster" stored k1
|
423
|
+
continue <- FALSE
|
424
|
+
}
|
425
|
+
# note that "subcluster" gives 1 or 2
|
426
|
+
list(continue = continue, subcluster = subcluster,
|
427
|
+
bic.prior = bic.prior, bic.post = bic.post$bic,
|
428
|
+
lnL.prior = lnL.prior, lnL.post = bic.post$lnL,
|
429
|
+
detVx.prior = detVx.prior, detVx.post = bic.post$detVx,
|
430
|
+
centers = bic.post$clsub$centers,
|
431
|
+
clj1 = bic.post$clj1, clj2 = bic.post$clj2)
|
432
|
+
}
|
433
|
+
|
434
|
+
|
435
|
+
|
436
|
+
|
437
|
+
# return BIC (prior BIC)
|
438
|
+
priorBIC <- function(x, centers, q, ignore.covar){
|
439
|
+
lnL0 <- lnL(x, centers, ignore.covar)
|
440
|
+
bic <- -2 * lnL0$lnL + q * log(nrow(x)) # BIC
|
441
|
+
# bic <- -2 * lnL0$lnL + q # AIC
|
442
|
+
list(lnL = lnL0$lnL, detVx = lnL0$detVx, bic = bic)
|
443
|
+
}
|
444
|
+
|
445
|
+
|
446
|
+
# return BICs (two posterior BICs)
|
447
|
+
postBICs <- function(x, centers, q, iter.max, ignore.covar){
|
448
|
+
#
|
449
|
+
# split to 2 clusters
|
450
|
+
clsub <- kmeans(x, 2, iter.max)
|
451
|
+
y.ok1 <- lapply(clsub$cluster, "==", 1) # 1st sub-cluster or not
|
452
|
+
y.ok2 <- lapply(clsub$cluster, "==", 2) # 2nd sub-cluster or not
|
453
|
+
# extract sub data
|
454
|
+
p <- ncol(x)
|
455
|
+
clj1 <- matrix(x[as.logical(y.ok1)], ncol=p)
|
456
|
+
clj2 <- matrix(x[as.logical(y.ok2)], ncol=p)
|
457
|
+
# ratio for pdf.
|
458
|
+
r1 <- clsub$size[1] / sum(clsub$size) # [0,1]
|
459
|
+
r2 <- 1 - r1 # [0,1]
|
460
|
+
# two later BICs
|
461
|
+
# print(clsub$centers[1,]) # for debug
|
462
|
+
# print(apply(clj1,2,mean)) # for debug
|
463
|
+
# print(sqrt(apply(clj1,2,var))) # for debug
|
464
|
+
# print(r1) # for debug
|
465
|
+
lnL1 <- lnL(clj1, clsub$centers[1,], ignore.covar)
|
466
|
+
# print(clsub$centers[2,]) # for debug
|
467
|
+
# print(apply(clj2,2,mean)) # for debug
|
468
|
+
# print(sqrt(apply(clj2,2,var))) # for debug
|
469
|
+
# print(r2) # for debug
|
470
|
+
lnL2 <- lnL(clj2, clsub$centers[2,], ignore.covar)
|
471
|
+
n1 <- nrow(clj1)
|
472
|
+
n2 <- nrow(clj2)
|
473
|
+
# normalizing factor; dist() is in library(mva)
|
474
|
+
if (is.na(lnL1$detVx) || is.na(lnL2$detVx))
|
475
|
+
beta <- 0
|
476
|
+
else
|
477
|
+
beta <- dist(clsub$center) / (sqrt(lnL1$detVx + lnL2$detVx))
|
478
|
+
alpha <- 0.5 / pnorm(beta)
|
479
|
+
BIC1 <- -2 * lnL1$lnL +q * log(n1)
|
480
|
+
BIC2 <- -2 * lnL2$lnL +q * log(n2)
|
481
|
+
# BIC1 <- -2 * lnL1$lnL +q # AIC
|
482
|
+
# BIC2 <- -2 * lnL2$lnL +q # AIC
|
483
|
+
|
484
|
+
# cat (paste("alpha =",alpha,"\n")) # for debug
|
485
|
+
# cat (paste("beta =",beta,"\n")) # for debug
|
486
|
+
|
487
|
+
# BIC is not (BIC1 + BIC2)
|
488
|
+
BIC <- -2 * lnL1$lnL -2 * lnL2$lnL + 2 * q * log(n1 + n2) - 2 * (n1 + n2) * log(alpha)
|
489
|
+
# BIC <- -2 * lnL1$lnL -2 * lnL2$lnL + 2 * q - 2 * (n1 + n2) * log(alpha) # AIC
|
490
|
+
list(bic = c(BIC1, BIC2, BIC),
|
491
|
+
lnL = c(lnL1$lnL, lnL2$lnL),
|
492
|
+
detVx = c(lnL1$detVx, lnL2$detVx),
|
493
|
+
clsub = clsub, clj1 = clj1, clj2 = clj2)
|
494
|
+
}
|
495
|
+
|
496
|
+
|
497
|
+
|
498
|
+
# return BICs for Two-merged clusters model and devided clusters model
|
499
|
+
# k1/k2: marged cluster ID
|
500
|
+
mergedBIC <- function(x, xcl, k1, k2, q, ignore.covar, pr.proc){
|
501
|
+
# sample size
|
502
|
+
# check for input data
|
503
|
+
n1 <- xcl$size[k1]
|
504
|
+
n2 <- xcl$size[k2]
|
505
|
+
if (n1 == 0 || n2 == 0){
|
506
|
+
# already had been merged
|
507
|
+
cat(paste("already had been merged\n"))
|
508
|
+
ret <- F
|
509
|
+
return( list (ret = ret))
|
510
|
+
}
|
511
|
+
if (is.null(xcl$lnL[k1]) || is.null(xcl$lnL[k2])){
|
512
|
+
# lnL may be null because of few data
|
513
|
+
cat(paste("lnL may be null because of few data\n"))
|
514
|
+
ret <- F
|
515
|
+
return( list (ret = ret))
|
516
|
+
}
|
517
|
+
|
518
|
+
# divided clusters model
|
519
|
+
lnL1 = xcl$lnL[k1]
|
520
|
+
lnL2 = xcl$lnL[k2]
|
521
|
+
ctrextrt <- rbind(xcl$centers[k1,], xcl$centers[k2,])
|
522
|
+
beta <- dist(ctrextrt) / (sqrt(xcl$detVx[k1] + xcl$detVx[k2]))
|
523
|
+
if (pr.proc) cat(paste("beta=", round (beta, digit=2), "\n"))
|
524
|
+
|
525
|
+
# if (beta > 10){
|
526
|
+
# # 2 clusters far apart
|
527
|
+
# ret <- F
|
528
|
+
# return( list (ret = ret))
|
529
|
+
# }
|
530
|
+
|
531
|
+
alpha <- 0.5 / as.numeric(pnorm(beta))
|
532
|
+
bicdiv <- -2 * lnL1 -2 * lnL2 + 2 * q * log(n1 + n2) - 2 * (n1 + n2) * log(alpha)
|
533
|
+
# bicdiv <- -2 * lnL1 -2 * lnL2 + 2 * q - 2 * (n1 + n2) * log(alpha) # AIC
|
534
|
+
|
535
|
+
# extract 2 clusters data
|
536
|
+
y.ok1 <- lapply(xcl$cluster, "==", k1) # 1st sub-cluster or not
|
537
|
+
y.ok2 <- lapply(xcl$cluster, "==", k2) # 2nd sub-cluster or not
|
538
|
+
|
539
|
+
# extract sub data
|
540
|
+
p = ncol(x)
|
541
|
+
clj1 <- matrix(x[as.logical(y.ok1)], ncol=p)
|
542
|
+
clj2 <- matrix(x[as.logical(y.ok2)], ncol=p)
|
543
|
+
xmgd <- rbind(clj1, clj2)
|
544
|
+
|
545
|
+
# merged cluster center
|
546
|
+
ctrmgd <- (n1 * xcl$centers[k1,] + n2 * xcl$centers[k2,]) / (n1 + n2)
|
547
|
+
lnLmgd <- lnL(xmgd, ctrmgd, ignore.covar)
|
548
|
+
bicmgd <- -2 * lnLmgd$lnL + q * log(nrow(xmgd)) # BIC
|
549
|
+
# bicmgd <- -2 * lnLmgd$lnL + q # AIC
|
550
|
+
|
551
|
+
ret <- T
|
552
|
+
list (ret = ret, ctrmgd = ctrmgd, lnLmgd = lnLmgd$lnL, detVxmgd = lnLmgd$detVx, bicmgd = bicmgd, bicdiv = bicdiv)
|
553
|
+
}
|
554
|
+
|
555
|
+
|
556
|
+
|
557
|
+
|
558
|
+
|
559
|
+
# log-likelihood under the assumption of
|
560
|
+
# p-dimensional multivariate normal distribution.
|
561
|
+
# ignore.covar: ignore the covariance
|
562
|
+
lnL <- function(x, centers, ignore.covar=T){
|
563
|
+
x <- as.matrix(x)
|
564
|
+
p <- ncol(x) # p-dimensional multivariate
|
565
|
+
n <- nrow(x) # sample size
|
566
|
+
if (missing(centers))
|
567
|
+
stop("centers must be a number or a matrix")
|
568
|
+
if (n <= 2) # few data
|
569
|
+
return(list(lnL=NA, detVx=NA))
|
570
|
+
vx <- var(x) # var-co.var matrix
|
571
|
+
# print(x) # for debug
|
572
|
+
if (p == 1){ # x is vector
|
573
|
+
invVx <- 1 / as.vector(vx)
|
574
|
+
detVx <- as.vector(vx)
|
575
|
+
}else{
|
576
|
+
if (ignore.covar){
|
577
|
+
invVx <- diag(1/diag(vx)) # inv. matrix when assuming diag.
|
578
|
+
detVx <- prod(diag(vx)) # det. when assuming diag.
|
579
|
+
}else{
|
580
|
+
invVx <- solve(vx) # inverse matrix of "vx"
|
581
|
+
y <- chol(vx) # Cholesky decomposition
|
582
|
+
detVx <- prod(diag(y)) # vx = t(y) %*% y, where y is triangular,
|
583
|
+
# then, det(vx) = det(t(y)) * det(y)
|
584
|
+
}
|
585
|
+
}
|
586
|
+
t1 <- -p/2 * 1.837877066 # 1.837... = log(2 * 3.1415...)
|
587
|
+
t2 <- -log(detVx) / 2
|
588
|
+
xmu <- t(apply(x, 1, "-", centers))
|
589
|
+
# print(centers) # for debug
|
590
|
+
# print(xmu) # for debug
|
591
|
+
# s <- 0
|
592
|
+
# for (i in 1:n)
|
593
|
+
# s <- s + t(xmu[i,]) %*% invVx %*% xmu[i,]
|
594
|
+
if (p == 1){
|
595
|
+
s <- sum(xmu^2 * invVx)
|
596
|
+
}else{
|
597
|
+
s <- sum(apply(xmu, 1, txInvVxX, invVx=invVx))
|
598
|
+
}
|
599
|
+
t3 <- -s / 2
|
600
|
+
ll <- (t1 + t2) * n + as.numeric(t3) # log likelihood
|
601
|
+
list(lnL=ll, detVx=detVx)
|
602
|
+
}
|
603
|
+
|
604
|
+
# function for calculation of
|
605
|
+
# t(xmu[i,]) %*% invVx %*% xmu[i,]
|
606
|
+
txInvVxX <- function(x, invVx){
|
607
|
+
t(x) %*% invVx %*% x
|
608
|
+
}
|