ruby-em_algorithm 0.0.2

Files changed (78)
  1. data/Gemfile +6 -0
  2. data/Gemfile.lock +30 -0
  3. data/README.md +44 -0
  4. data/Rakefile +7 -0
  5. data/example/.ex1.rb.swp +0 -0
  6. data/example/.ex2.rb.swp +0 -0
  7. data/example/.ex3-tmp.rb.swp +0 -0
  8. data/example/.ex3.rb.swp +0 -0
  9. data/example/data/2dim-gmm-new.txt +1267 -0
  10. data/example/data/2dim-gmm-simple.txt +676 -0
  11. data/example/data/2dim-gmm-test.txt +6565 -0
  12. data/example/data/2dim-gmm-test2.txt +2782 -0
  13. data/example/data/2dim-gmm-test3.csv +1641 -0
  14. data/example/data/2dim-gmm-test3.txt +2782 -0
  15. data/example/data/2dim-gmm-test4.csv +868 -0
  16. data/example/data/2dim-gmm-test4.txt +4924 -0
  17. data/example/data/2dim-gmm-without_weight-small.txt +2401 -0
  18. data/example/data/2dim-gmm-without_weight.txt +18001 -0
  19. data/example/data/2dim-gmm.txt +1267 -0
  20. data/example/data/gmm-new.txt +10001 -0
  21. data/example/data/gmm-simple.txt +676 -0
  22. data/example/data/gmm.txt +10001 -0
  23. data/example/data/old-gmm.txt +10000 -0
  24. data/example/ex1.rb +20 -0
  25. data/example/ex1.rb~ +20 -0
  26. data/example/ex2.rb +33 -0
  27. data/example/ex2.rb~ +33 -0
  28. data/example/ex3-tmp.rb +23 -0
  29. data/example/ex3-tmp.rb~ +25 -0
  30. data/example/ex3.rb +43 -0
  31. data/example/ex3.rb~ +43 -0
  32. data/example/tools/.2dim.rb.swp +0 -0
  33. data/example/tools/2dim.rb +69 -0
  34. data/example/tools/2dim.rb~ +69 -0
  35. data/example/tools/boxmuller.rb +28 -0
  36. data/example/tools/boxmuller.rb~ +28 -0
  37. data/example/tools/conv_from_yaml.rb +8 -0
  38. data/example/tools/conv_from_yaml_to_csv.rb +8 -0
  39. data/example/tools/conv_to_yaml.rb +17 -0
  40. data/example/tools/ellipsoid.gnuplot +63 -0
  41. data/example/tools/ellipsoid.gnuplot~ +64 -0
  42. data/example/tools/histogram.rb +19 -0
  43. data/example/tools/histogram2d.rb +20 -0
  44. data/example/tools/histogram2d.rb~ +18 -0
  45. data/example/tools/kmeans.rb +34 -0
  46. data/example/tools/mean.rb +19 -0
  47. data/example/tools/table.data +4618 -0
  48. data/example/tools/tmp.txt +69632 -0
  49. data/example/tools/xmeans.R +608 -0
  50. data/example/tools/xmeans.rb +35 -0
  51. data/lib/em_algorithm/.base.rb.swp +0 -0
  52. data/lib/em_algorithm/base.rb +116 -0
  53. data/lib/em_algorithm/base.rb~ +116 -0
  54. data/lib/em_algorithm/convergence/.chi_square.rb.swp +0 -0
  55. data/lib/em_algorithm/convergence/.likelihood.rb.swp +0 -0
  56. data/lib/em_algorithm/convergence/check_method.rb +4 -0
  57. data/lib/em_algorithm/convergence/check_method.rb~ +0 -0
  58. data/lib/em_algorithm/convergence/chi_square.rb +40 -0
  59. data/lib/em_algorithm/convergence/chi_square.rb~ +40 -0
  60. data/lib/em_algorithm/convergence/likelihood.rb +35 -0
  61. data/lib/em_algorithm/convergence/likelihood.rb~ +35 -0
  62. data/lib/em_algorithm/models/.gaussian.rb.swp +0 -0
  63. data/lib/em_algorithm/models/.md_gaussian.rb.swp +0 -0
  64. data/lib/em_algorithm/models/.mixture.rb.swp +0 -0
  65. data/lib/em_algorithm/models/.model.rb.swp +0 -0
  66. data/lib/em_algorithm/models/gaussian.rb +47 -0
  67. data/lib/em_algorithm/models/gaussian.rb~ +47 -0
  68. data/lib/em_algorithm/models/md_gaussian.rb +67 -0
  69. data/lib/em_algorithm/models/md_gaussian.rb~ +67 -0
  70. data/lib/em_algorithm/models/mixture.rb +122 -0
  71. data/lib/em_algorithm/models/mixture.rb~ +122 -0
  72. data/lib/em_algorithm/models/model.rb +19 -0
  73. data/lib/em_algorithm/models/model.rb~ +19 -0
  74. data/lib/ruby-em_algorithm.rb +3 -0
  75. data/lib/ruby-em_algorithm/version.rb +3 -0
  76. data/ruby-em_algorithm.gemspec +21 -0
  77. data/spec/spec_helper.rb +9 -0
  78. metadata +178 -0
@@ -0,0 +1,608 @@
+ # $Id: xmeans.prog,v 1.23 2012/04/12 11:21:12 tunenori Exp tunenori $
+ #
+ # X-MEANS Clustering
+ #
+ # Description:
+ #
+ #     Perform x-means non-hierarchical clustering on a data matrix.
+ #
+ # Usage:
+ #
+ #     xmeans(x, ik = 2, iter.max = 10, pr.proc = F,
+ #            ignore.covar = T, merge.cls = F)
+ #
+ # Arguments:
+ #     x: A numeric matrix of data, or an object that can be coerced to
+ #         such a matrix (such as a numeric vector or a data frame with
+ #         all numeric columns).
+ #
+ #     ik: The initial number of clusters applied to kmeans().
+ #         As xmeans calls kmeans recursively, 'ik' should be
+ #         sufficiently small.
+ #
+ #     iter.max: The maximum number of iterations allowed.
+ #
+ #     pr.proc: logical: if 'TRUE', the processing status is printed.
+ #
+ #     ignore.covar: logical: if 'TRUE', covariances of the cluster data
+ #         are ignored. To save time, 'TRUE' is the default.
+ #
+ #     merge.cls: logical: if 'TRUE', some clusters may be merged into
+ #         other clusters after the iterative division.
+ #
+ # Value:
+ #
+ #     An object of class 'xmeans', which is a list with components:
+ #
+ #     cluster: A vector of integers indicating the cluster to which each
+ #         point is allocated.
+ #
+ #     centers: A matrix of cluster centres.
+ #
+ #     size: The number of points in each cluster. When 'merge.cls' is TRUE,
+ #         some elements may be zero.
+ #
+ # References:
+ #
+ #     Ishioka, T. (2005): "An Expansion of X-means for Automatically
+ #     Determining the Optimal Number of Clusters," The Fourth IASTED
+ #     International Conference on Computational Intelligence (CI 2005),
+ #     Calgary, Canada, July 4-6, pp. 91-96.
+ #     http://www.rd.dnc.ac.jp/%7Etunenori/doc/487-053.pdf
+ #
+ #     Ishioka, T. (2000): "Extended K-means with an Efficient Estimation
+ #     of the Number of Clusters," Intelligent Data Engineering and
+ #     Automated Learning --- IDEAL 2000, Second International Conference,
+ #     Shatin, N.T., Hong Kong, China, December 2000, proceedings 17--22.
+ #     (Lecture Notes in Computer Science 1983, Kwong Sak Leung, Lai-Wan
+ #     Chan, Helen Meng (Eds.), Springer, 17--22, 2000)
+ #     http://www.rd.dnc.ac.jp/%7Etunenori/doc/xmeans_ideal2000.pdf
+ #
+ # Examples:
+ #
+ #     xmeans(iris[,-5], merge.cls=T)
+ #     plot(cmdscale(dist(iris[,-5])), cex=2, pch=as.numeric(iris[,5]))
+ #
+
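For readers who want to try the script on its own, a minimal sketch is shown below. It is not part of the gem; the synthetic data, the seed and the file path are illustrative. It only assumes that the functions defined in this file have been loaded (for example with source("xmeans.R")) and that base R provides kmeans(), dist() and pnorm().

    source("xmeans.R")                      # illustrative path to this script
    set.seed(1)
    xy <- rbind(matrix(rnorm(100, mean = 0), ncol = 2),   # two synthetic 2-D clusters
                matrix(rnorm(100, mean = 8), ncol = 2))
    xc <- xmeans(xy, ik = 2, merge.cls = TRUE)
    xc$size                                 # points per estimated cluster (may contain zeros)
    xc$centers                              # one row per cluster centre
    plot(xy, col = xc$cluster, pch = 20)
    points(xc$centers, pch = 8)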
+ xmeans <- function(x, ik = 2, iter.max = 10, pr.proc = F, ignore.covar = T, merge.cls = F){
+     if (ik < 2)
+         ik <- 2
+     x <- as.matrix(x)
+     p <- ncol(x)    # p-dimensional multivariate
+     if (ignore.covar){
+         q <- 2 * p    # number of parameters; mean and var for each "p"
+     }else{
+         q <- p * (p+3) / 2    # integer
+     }
+     cl <- kmeans(x, ik, iter.max)
+     cl.sub <- list()
+
+     for (i in 1:ik){    # for each cluster
+         y.ok <- (cl$cluster == i)    # i-th cluster or not
+         yi <- matrix(x[y.ok], ncol=p)    # extract the i-th cluster
+         zi <- yi    # save the data for graphics
+         yi.centers <- cl$centers[i,]
+         zi.centers <- yi.centers
+         yi.cluster <- cl$cluster[(cl$cluster == i)]
+         yi.cluster <- rep(1, length(yi.cluster))
+             # sub-cluster numbers should begin from 1
+
+         k1 <- 1    # cluster number
+         k2 <- k1 + 1
+         bic.prior <- NULL
+         stack <- list()    # divided but not yet processed data are stacked
+         lnL0 <- lnL(yi, yi.centers, ignore.covar)
+         yi.lnL <- lnL0$lnL
+         yi.detVx <- lnL0$detVx
+
+         repeat{
+
+             # go through at least once;
+             # y$subcluster exists...
+             if (pr.proc) cat(paste("k1 =", k1, ", k2 =", k2, "\n"))
+             if (nrow(yi) == 1){    # sample size is 1
+                 break
+             }
+             y <- split2cls(yi, yi.centers, q, bic.prior, lnL.prior, detVx.prior, iter.max, ignore.covar)
+             if (y$continue){    # splitting continues
+                 yi.cluster <-
+                     updtCrusterNum(y$continue, yi.cluster, k1, k2, y$subcluster)
+                 zi.centers <-
+                     updtCenters(y$continue, zi.centers, k1, k2, y$centers)
+                 yi.lnL <-
+                     updtlnL(y$continue, yi.lnL, k1, k2, y$lnL.post)
+                 yi.detVx <-
+                     updtdetVx(y$continue, yi.detVx, k1, k2, y$detVx.post)
+             }
+
+             if (pr.proc) print(y$subcluster)
+             if (pr.proc){ print(y$bic.prior)
+                 print(y$bic.post)
+                 # print(y$lnL.prior)    # for debug
+                 # print(y$lnL.post)    # for debug
+                 print(y$continue) }
+             # cat("zi.centers=\n")    # for debug
+             # print(zi.centers)    # for debug
+             if (!y$continue){    # no node
+                 if ((nstack <- length(stack))){    # there are stacked data
+                     # extract the stacked data
+                     if (pr.proc)
+                         cat(paste("extract the stacked data (", nstack, ")...\n"))
+                     yi <- stack[[nstack]]$data
+                     yi.centers <- stack[[nstack]]$centers
+                     bic.prior <- stack[[nstack]]$bic
+                     lnL.prior <- stack[[nstack]]$lnL
+                     detVx.prior <- stack[[nstack]]$detVx
+                     k1 <- stack[[nstack]]$cls
+                     k2 <- k2    # unchanged
+                     # delete the data set
+                     if (nstack > 1){
+                         stack <- stack[1:(nstack-1)]
+                     }else{
+                         stack <- list()    # no stacked data
+                     }
+                     next;
+                 }
+                 # no node and no stack
+                 if (pr.proc) cat("no node and no stack...\n")
+                 break;
+             }
+             # splitting continues...
+             y1 <- y$clj1    # data
+             y2 <- y$clj2
+             yi.ctr1 <- y$centers[1,]    # centers
+             yi.ctr2 <- y$centers[2,]
+             bic.prior1 <- y$bic.post[1]    # bic
+             bic.prior2 <- y$bic.post[2]
+             lnL.prior1 <- y$lnL.post[1]    # lnL
+             lnL.prior2 <- y$lnL.post[2]
+             detVx.prior1 <- y$detVx.post[1]    # detVx
+             detVx.prior2 <- y$detVx.post[2]
+
+             # one half is processed recursively...
+             yi <- y1
+             yi.centers <- yi.ctr1
+             bic.prior <- bic.prior1
+             lnL.prior <- lnL.prior1
+             detVx.prior <- detVx.prior1
+             # the other half is stacked...
+             if (pr.proc) cat("stacking ...\n")
+             stack <- c(stack,
+                 list(list(data=y2, centers=yi.ctr2,
+                     bic=bic.prior2, lnL=lnL.prior2, detVx=detVx.prior2, cls=k2)))
+             # increment the cluster number
+             k2 <- k2 + 1
+
+         }    # end of repeat
+
+         # splitting done ...
+         if (pr.proc){
+             cat("splitting done...\n")
+             cat(paste("main cluster =", i, "*******\n"))
+         }
+         cl.sub <- c(cl.sub, list(list(cluster = yi.cluster,
+             centers = zi.centers, lnL = yi.lnL, detVx = yi.detVx,
+             size = tabulate(yi.cluster))))
+         if (pr.proc){
+             print(cl.sub[[i]])
+             plot(zi, col=yi.cluster)
+             if (is.vector(zi.centers))
+                 points(zi.centers[1], zi.centers[2], pch=8)
+             else    # array
+                 points(zi.centers, col=1:(length(zi.centers)/p), pch=8)
+         }
+     }
+     if (pr.proc) print(cl.sub)
+     xcl <- mergeResult(cl, cl.sub, ik)
+
+     if (merge.cls == F) {
+         return(list(cluster = xcl$cluster, centers = xcl$centers, size = xcl$size))
+     }
+
+     # merge after progressive dividing
+     #
+     if (pr.proc) cat("merging after progressive dividing ...\n")
+
+     k <- length(xcl$size)    # final cluster number
+     if (k <= 2){    # minimum cluster number should be 2
+         if (pr.proc) cat("merging skipped ...\n")
+         return(list(cluster = xcl$cluster, centers = xcl$centers, size = xcl$size))
+     }
+     if (pr.proc){
+         cat("xcl$detVx=")
+         print(xcl$detVx)
+         cat("xcl$size=")
+         print(xcl$size)
+     }
+
+     klist <- sort.list(xcl$size)    # indices of xcl$size in ascending order
+     if (pr.proc) print(klist)
+     for (i in 1:(k-1)){
+         for (j in (i+1):k){
+             k1 = klist[i]
+             k2 = klist[j]
+             if (pr.proc) cat(paste("inspecting the clusters", k1, "and", k2, "\n"))
+
+             z <- mergedBIC(x, xcl, k1, k2, q, ignore.covar, pr.proc)
+             if (z$ret == F){
+                 # k1 or k2 has already been merged;
+                 # skip this loop
+                 if (pr.proc) cat("skipping... k1=", k1, "k2=", k2, "\n")
+                 next
+             }
+             if (z$bicdiv > z$bicmgd){
+                 # we prefer the merged model;
+                 # replace the larger cls. number with the smaller one
+                 if (pr.proc) cat("replace cls.", k2, "to", k1, "\n")
+                 xcl$cluster <- replace(xcl$cluster, (xcl$cluster == k2), k1)
+                 xcl$size[k1] <- xcl$size[k1] + xcl$size[k2]
+                 xcl$size[k2] <- 0
+                 xcl$lnL[k1] <- z$lnLmgd
+                 xcl$lnL[k2] <- 0
+                 xcl$detVx[k1] <- z$detVxmgd
+                 xcl$detVx[k2] <- 0
+                 xcl$centers[k1,] <- z$ctrmgd
+                 xcl$centers[k2,] <- 0
+
+             }
+         }
+     }
+     list(cluster = xcl$cluster, centers = xcl$centers, size = xcl$size)
+ }
+
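Inside the repeat loop above, split2cls() compares a one-cluster model of the current subset against a two-cluster model obtained by a further kmeans() run. A compact restatement of the criterion the code evaluates (a summary, not text from the script): with \hat\theta the fitted mean and variance parameters, q the number of free parameters and n the subset size,

    \mathrm{BIC} = -2 \ln L(\hat\theta; x) + q \log n ,

and the subset is divided whenever the two-cluster criterion returned by postBICs() is smaller than this one-cluster BIC; otherwise the subset is kept whole and the next stacked subset is processed.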
+
+
+
+ # merge the results of the sub-clustering;
+ # cluster numbers given by the first kmeans should be renumbered;
+ # the other centers and sizes are simply added.
+ # cl: the result of the first kmeans
+ # cl.sub: the result of the sub-clustering
+ # ik: the cluster number passed to kmeans.
+ mergeResult <- function(cl, cl.sub, ik){
+     cluster <- cl$cluster    # main cluster
+     centers <- NULL
+     size <- NULL
+     lnL <- NULL
+     detVx <- NULL
+
+     k <- 0    # number of unique clusters; k is decremented below.
+     for (i in 1:ik)
+         k <- k + length(cl.sub[[i]]$size)
+     kk <- k
+
+     for (i in ik:1){    # loop over the main clusters obtained by kmeans
+         xsub <- cl.sub[[i]]$cluster
+         iki <- ik - i + 1
+         centers <- rbind(centers, cl.sub[[iki]]$centers)
+         size <- c(size, cl.sub[[iki]]$size)
+         lnL <- c(lnL, cl.sub[[iki]]$lnL)
+         detVx <- c(detVx, cl.sub[[iki]]$detVx)
+
+         for (j in length(cl.sub[[i]]$size):1){    # loop over the subclusters
+             xsub <- replace(xsub, (xsub == j), k)
+             k <- k - 1
+         }
+         cluster <- replace(cluster, (cluster == i), xsub)
+     }
+     if (k != 0) stop("mergeResult: assertion failed (k = 0)...")
+     dimnames(centers) <- list(1:kk, NULL)
+     list(cluster = cluster, centers = centers, lnL = lnL, detVx = detVx, size = size)
+ }
+
+
+ # update the cluster numbers by using the result of "split2cls()"
+ # continue: if FALSE, no splitting was done
+ # v: vector of cluster numbers for the initial cluster.
+ # k1: the cluster number to be updated; "k1" becomes "k1" and "k2"
+ # xsub: sub-cluster numbers vector of "v" whose value is "k1";
+ #     the given "xsub" holds 1 or 2.
+ updtCrusterNum <- function(continue, v, k1, k2, xsub){
+     if (!is.vector(v))
+         return(xsub)
+     if (!continue)
+         return(v)
+     if (k1 == k2)
+         stop("updtCrusterNum() : k1 and k2 should differ.")
+
+     # the loop below is the same algorithm; explicit array operations are slow in R.
+     # j <- 1
+     # for (i in 1:length(v)){
+     #     if (v[i] == k1){
+     #         if (xsub[j] == 2)
+     #             v[i] <- k2
+     #         j <- j + 1
+     #     }
+     # }
+     # end of algorithm
+     xsub <- replace(xsub, (xsub == 2), k2)    # changed
+     xsub <- replace(xsub, (xsub == 1), k1)    # unchanged
+     v <- replace(v, (v == k1), xsub)
+ }
+
+
+ # update the cluster centers by using the result of "split2cls()"
+ # continue: if FALSE, no update is made
+ # org.centers: original centers matrix
+ # divided.centers: divided centers matrix; it has 2 rows.
+ updtCenters <- function(continue, org.centers, k1, k2, divided.centers){
+     if (!is.matrix(org.centers))
+         return(divided.centers)
+     if (!continue)
+         return(org.centers)
+     if (k1 == k2)
+         stop("updtCenters() : k1 and k2 should differ.")
+
+     z <- NULL
+     for (i in 1:max(k2, nrow(org.centers))){
+         if (i == k1)
+             z <- rbind(z, divided.centers[1,])
+         else if (i == k2)
+             z <- rbind(z, divided.centers[2,])
+         else
+             z <- rbind(z, org.centers[i,])
+     }
+     z
+ }
+
+ # update the lnL by using the result of "split2cls()"
+ # continue: if FALSE, no update is made
+ # org.lnL: original lnL vector
+ # divided.lnL: divided lnL vector having 2 elements.
+ updtlnL <- function(continue, org.lnL, k1, k2, divided.lnL){
+     if (!is.vector(org.lnL))
+         return(divided.lnL)
+     if (!continue)
+         return(org.lnL)
+     if (k1 == k2)
+         stop("updtlnL() : k1 and k2 should differ.")
+
+     z <- NULL
+     for (i in 1:max(k2, length(org.lnL))){
+         if (i == k1)
+             z <- c(z, divided.lnL[1])
+         else if (i == k2)
+             z <- c(z, divided.lnL[2])
+         else
+             z <- c(z, org.lnL[i])
+     }
+     z
+ }
+
+ # update the detVx by using the result of "split2cls()"
+ # continue: if FALSE, no update is made
+ # org.detVx: original detVx vector
+ # divided.detVx: divided detVx vector having 2 elements.
+ updtdetVx <- function(continue, org.detVx, k1, k2, divided.detVx){
+     if (!is.vector(org.detVx))
+         return(divided.detVx)
+     if (!continue)
+         return(org.detVx)
+     if (k1 == k2)
+         stop("updtdetVx() : k1 and k2 should differ.")
+
+     z <- NULL
+     for (i in 1:max(k2, length(org.detVx))){
+         if (i == k1)
+             z <- c(z, divided.detVx[1])
+         else if (i == k2)
+             z <- c(z, divided.detVx[2])
+         else
+             z <- c(z, org.detVx[i])
+     }
+     z
+ }
+
+ # split into 2 clusters if the BIC prefers it
+ # q: the number of parameters
+ # bic.prior: BIC of the given x; if bic.prior=NULL, it is calculated here
+ # lnL.prior: lnL of the given x; if bic.prior=NULL, it is calculated here
+ # detVx.prior: detVx of the given x; if bic.prior=NULL, it is calculated here
+ split2cls <- function(x, centers, q, bic.prior, lnL.prior, detVx.prior, iter.max, ignore.covar){
+     if (is.null(bic.prior)){
+         pb <- priorBIC(x, centers, q, ignore.covar)
+         bic.prior <- pb$bic
+         lnL.prior <- pb$lnL
+         detVx.prior <- pb$detVx
+     }
+     bic.post <- postBICs(x, centers, q, iter.max, ignore.covar)
+
+     subcluster <- bic.post$clsub$cluster
+     #
+     # decide whether we should split
+     if (is.na(bic.post$bic[3])){
+         # BIC may be NA because of too few data
+         continue <- FALSE
+     }else if (bic.post$bic[3] < bic.prior){
+         # splitting ...
+         # the cluster numbers in cl$cluster will be replaced
+         continue <- TRUE
+     }else{
+         # not splitting...
+         # return "subcluster" stored as k1
+         continue <- FALSE
+     }
+     # note that "subcluster" gives 1 or 2
+     list(continue = continue, subcluster = subcluster,
+         bic.prior = bic.prior, bic.post = bic.post$bic,
+         lnL.prior = lnL.prior, lnL.post = bic.post$lnL,
+         detVx.prior = detVx.prior, detVx.post = bic.post$detVx,
+         centers = bic.post$clsub$centers,
+         clj1 = bic.post$clj1, clj2 = bic.post$clj2)
+ }
+
+
+
+
+ # return the BIC (prior BIC)
+ priorBIC <- function(x, centers, q, ignore.covar){
+     lnL0 <- lnL(x, centers, ignore.covar)
+     bic <- -2 * lnL0$lnL + q * log(nrow(x))    # BIC
+     # bic <- -2 * lnL0$lnL + q    # AIC
+     list(lnL = lnL0$lnL, detVx = lnL0$detVx, bic = bic)
+ }
+
+
+ # return the BICs (two posterior BICs)
+ postBICs <- function(x, centers, q, iter.max, ignore.covar){
+     #
+     # split into 2 clusters
+     clsub <- kmeans(x, 2, iter.max)
+     y.ok1 <- lapply(clsub$cluster, "==", 1)    # 1st sub-cluster or not
+     y.ok2 <- lapply(clsub$cluster, "==", 2)    # 2nd sub-cluster or not
+     # extract the sub data
+     p <- ncol(x)
+     clj1 <- matrix(x[as.logical(y.ok1)], ncol=p)
+     clj2 <- matrix(x[as.logical(y.ok2)], ncol=p)
+     # ratio for the pdf
+     r1 <- clsub$size[1] / sum(clsub$size)    # [0,1]
+     r2 <- 1 - r1    # [0,1]
+     # two posterior BICs
+     # print(clsub$centers[1,])    # for debug
+     # print(apply(clj1,2,mean))    # for debug
+     # print(sqrt(apply(clj1,2,var)))    # for debug
+     # print(r1)    # for debug
+     lnL1 <- lnL(clj1, clsub$centers[1,], ignore.covar)
+     # print(clsub$centers[2,])    # for debug
+     # print(apply(clj2,2,mean))    # for debug
+     # print(sqrt(apply(clj2,2,var)))    # for debug
+     # print(r2)    # for debug
+     lnL2 <- lnL(clj2, clsub$centers[2,], ignore.covar)
+     n1 <- nrow(clj1)
+     n2 <- nrow(clj2)
+     # normalizing factor; dist() is in library(mva)
+     if (is.na(lnL1$detVx) || is.na(lnL2$detVx))
+         beta <- 0
+     else
+         beta <- dist(clsub$centers) / (sqrt(lnL1$detVx + lnL2$detVx))
+     alpha <- 0.5 / pnorm(beta)
+     BIC1 <- -2 * lnL1$lnL + q * log(n1)
+     BIC2 <- -2 * lnL2$lnL + q * log(n2)
+     # BIC1 <- -2 * lnL1$lnL + q    # AIC
+     # BIC2 <- -2 * lnL2$lnL + q    # AIC
+
+     # cat(paste("alpha =", alpha, "\n"))    # for debug
+     # cat(paste("beta =", beta, "\n"))    # for debug
+
+     # BIC is not (BIC1 + BIC2)
+     BIC <- -2 * lnL1$lnL - 2 * lnL2$lnL + 2 * q * log(n1 + n2) - 2 * (n1 + n2) * log(alpha)
+     # BIC <- -2 * lnL1$lnL - 2 * lnL2$lnL + 2 * q - 2 * (n1 + n2) * log(alpha)    # AIC
+     list(bic = c(BIC1, BIC2, BIC),
+         lnL = c(lnL1$lnL, lnL2$lnL),
+         detVx = c(lnL1$detVx, lnL2$detVx),
+         clsub = clsub, clj1 = clj1, clj2 = clj2)
+ }
+
+
+
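postBICs() above implements the two-cluster criterion from Ishioka (2000, 2005). Written out as a restatement of the code, with \Phi the standard normal CDF used via pnorm():

    \beta = \frac{\lVert c_1 - c_2 \rVert}{\sqrt{\det V_1 + \det V_2}}, \qquad \alpha = \frac{0.5}{\Phi(\beta)},

    \mathrm{BIC}' = -2 \ln L_1 - 2 \ln L_2 + 2 q \log(n_1 + n_2) - 2 (n_1 + n_2) \log \alpha ,

where c_j, V_j, \ln L_j and n_j are the centre, covariance matrix, log-likelihood and size of sub-cluster j. The same expression appears as bicdiv in mergedBIC() below, where it is used to decide whether two clusters should be merged back together.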
+ # return the BICs for the two-merged-clusters model and the divided-clusters model
+ # k1/k2: IDs of the clusters to be merged
+ mergedBIC <- function(x, xcl, k1, k2, q, ignore.covar, pr.proc){
+     # sample sizes
+     # check the input data
+     n1 <- xcl$size[k1]
+     n2 <- xcl$size[k2]
+     if (n1 == 0 || n2 == 0){
+         # one of them has already been merged
+         cat(paste("already had been merged\n"))
+         ret <- F
+         return( list (ret = ret))
+     }
+     if (is.null(xcl$lnL[k1]) || is.null(xcl$lnL[k2])){
+         # lnL may be NULL because of too few data
+         cat(paste("lnL may be null because of few data\n"))
+         ret <- F
+         return( list (ret = ret))
+     }
+
+     # divided-clusters model
+     lnL1 = xcl$lnL[k1]
+     lnL2 = xcl$lnL[k2]
+     ctrextrt <- rbind(xcl$centers[k1,], xcl$centers[k2,])
+     beta <- dist(ctrextrt) / (sqrt(xcl$detVx[k1] + xcl$detVx[k2]))
+     if (pr.proc) cat(paste("beta=", round(beta, digits=2), "\n"))
+
+     # if (beta > 10){
+     #     # 2 clusters far apart
+     #     ret <- F
+     #     return( list (ret = ret))
+     # }
+
+     alpha <- 0.5 / as.numeric(pnorm(beta))
+     bicdiv <- -2 * lnL1 - 2 * lnL2 + 2 * q * log(n1 + n2) - 2 * (n1 + n2) * log(alpha)
+     # bicdiv <- -2 * lnL1 - 2 * lnL2 + 2 * q - 2 * (n1 + n2) * log(alpha)    # AIC
+
+     # extract the data of the 2 clusters
+     y.ok1 <- lapply(xcl$cluster, "==", k1)    # 1st sub-cluster or not
+     y.ok2 <- lapply(xcl$cluster, "==", k2)    # 2nd sub-cluster or not
+
+     # extract the sub data
+     p = ncol(x)
+     clj1 <- matrix(x[as.logical(y.ok1)], ncol=p)
+     clj2 <- matrix(x[as.logical(y.ok2)], ncol=p)
+     xmgd <- rbind(clj1, clj2)
+
+     # merged cluster center
+     ctrmgd <- (n1 * xcl$centers[k1,] + n2 * xcl$centers[k2,]) / (n1 + n2)
+     lnLmgd <- lnL(xmgd, ctrmgd, ignore.covar)
+     bicmgd <- -2 * lnLmgd$lnL + q * log(nrow(xmgd))    # BIC
+     # bicmgd <- -2 * lnLmgd$lnL + q    # AIC
+
+     ret <- T
+     list(ret = ret, ctrmgd = ctrmgd, lnLmgd = lnLmgd$lnL, detVxmgd = lnLmgd$detVx, bicmgd = bicmgd, bicdiv = bicdiv)
+ }
+
+
+
+
+
+ # log-likelihood under the assumption of a
+ # p-dimensional multivariate normal distribution.
+ # ignore.covar: ignore the covariances
+ lnL <- function(x, centers, ignore.covar=T){
+     x <- as.matrix(x)
+     p <- ncol(x)    # p-dimensional multivariate
+     n <- nrow(x)    # sample size
+     if (missing(centers))
+         stop("centers must be a number or a matrix")
+     if (n <= 2)    # too few data
+         return(list(lnL=NA, detVx=NA))
+     vx <- var(x)    # variance-covariance matrix
+     # print(x)    # for debug
+     if (p == 1){    # x is a vector
+         invVx <- 1 / as.vector(vx)
+         detVx <- as.vector(vx)
+     }else{
+         if (ignore.covar){
+             invVx <- diag(1/diag(vx))    # inverse matrix when assuming a diagonal vx
+             detVx <- prod(diag(vx))    # determinant when assuming a diagonal vx
+         }else{
+             invVx <- solve(vx)    # inverse matrix of "vx"
+             y <- chol(vx)    # Cholesky decomposition
+             detVx <- prod(diag(y))^2    # vx = t(y) %*% y, where y is triangular;
+                 # then, det(vx) = det(t(y)) * det(y)
+         }
+     }
+     t1 <- -p/2 * 1.837877066    # 1.837... = log(2 * 3.1415...)
+     t2 <- -log(detVx) / 2
+     xmu <- t(apply(x, 1, "-", centers))
+     # print(centers)    # for debug
+     # print(xmu)    # for debug
+     # s <- 0
+     # for (i in 1:n)
+     #     s <- s + t(xmu[i,]) %*% invVx %*% xmu[i,]
+     if (p == 1){
+         s <- sum(xmu^2 * invVx)
+     }else{
+         s <- sum(apply(xmu, 1, txInvVxX, invVx=invVx))
+     }
+     t3 <- -s / 2
+     ll <- (t1 + t2) * n + as.numeric(t3)    # log likelihood
+     list(lnL=ll, detVx=detVx)
+ }
+
+ # function for the calculation of
+ # t(xmu[i,]) %*% invVx %*% xmu[i,]
+ txInvVxX <- function(x, invVx){
+     t(x) %*% invVx %*% x
+ }
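The constant 1.837877066 in lnL() is log(2*pi). Under the p-variate normal model the function evaluates, restated from the code,

    \ln L = -\frac{np}{2}\,\ln(2\pi) - \frac{n}{2}\,\ln\det V - \frac{1}{2}\sum_{i=1}^{n} (x_i - \mu)^{\top} V^{-1} (x_i - \mu),

with \mu taken from the centers argument and V the sample variance-covariance matrix var(x). When ignore.covar=T (the default), V is replaced by its diagonal, so \det V is the product of the per-dimension variances and V^{-1} is the diagonal matrix of their reciprocals.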