miga-base 0.2.2.1 → 0.2.2.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,186 @@
1
+ #!/usr/bin/env Rscript
2
+ #
3
+ # @package MiGA
4
+ # @license Artistic-2.0
5
+ #
6
+
7
+ #= Load stuff
8
+ argv <- commandArgs(trailingOnly=T)
9
+ suppressPackageStartupMessages(library(ape))
10
+ suppressPackageStartupMessages(library(vegan))
11
+ suppressPackageStartupMessages(library(cluster))
12
+ suppressPackageStartupMessages(library(parallel))
13
+ suppressPackageStartupMessages(library(enveomics.R))
14
+
15
+ #= Main function
16
# Cluster genomes into subclades from pairwise ANI values, recursing into
# every cluster that holds more than five genomes.
#
# Arguments:
#   ani_file  Path to a (possibly gzipped) tab-delimited table with a header;
#             the columns `a`, `b` and `value` are read. If missing, `ani` is
#             used instead.
#   out_base  Prefix of every output path written by this call.
#   thr       Number of worker processes handed to makeCluster().
#   ani       Table with the same columns as `ani_file`; only used when
#             `ani_file` is missing (this is how the recursive calls pass
#             the subset of pairs for one subclade).
#
# Outputs (all prefixed by `out_base`): `.nwk` tree, `.pdf` report,
# `.medoids`, `.dist.rdata`, `.classif`, and one `.sc-<i>/` directory per
# cluster containing `miga-project.all` (plus the recursive results).
#
# Returns NULL.
subclades <- function(ani_file, out_base, thr=1, ani=c()) {
  say("==> Out base:", out_base, "<==")

  # Input arguments
  if (missing(ani_file)) {
    a <- as.data.frame(ani)
  } else {
    a <- read.table(gzfile(ani_file), sep="\t", header=TRUE, as.is=TRUE)
  }
  if (nrow(a) == 0) {
    generate_empty_files(out_base)
    return(NULL)
  }

  # Get ANI distances
  say("Distances")
  a$d <- 1 - a$value / 100
  # NOTE(review): enve.df2dist() presumably returns a `dist` object with the
  # genome names as labels (it is passed to pam(), labels() and as.matrix()
  # below) -- confirm against the enveomics.R documentation.
  ani.d <- enve.df2dist(data.frame(a$a, a$b, a$d), default.d=max(a$d) * 1.2)
  n <- length(labels(ani.d))
  if (n < 3) {
    # pam() needs k < n, so k = 2 (the smallest k evaluated below) is
    # impossible with fewer than three genomes.
    say("Too few genomes to cluster:", n)
    generate_empty_files(out_base)
    return(NULL)
  }
  ani.ph <- bionj(ani.d)

  # Temporarily raise the nested-expression limit while writing the tree.
  # R only accepts values in 25..500000 for this option, so cap it there.
  express.ori <- options("expressions")$expressions
  express.need <- ani.ph$Nnode * 4
  if (express.ori < express.need) {
    options(expressions=min(5e5, express.need))
  }
  tryCatch(
    write.tree(ani.ph, paste0(out_base, ".nwk")),
    finally = options(expressions=express.ori)
  )

  # Silhouette
  say("Silhouette")
  k <- 2:min(n - 1, 100)
  cl <- makeCluster(thr)
  # Row 1: mean silhouette width; row 2: minus the sum of the non-positive
  # silhouette widths. One column per value of k.
  s <- tryCatch(
    parSapply(cl, k, function(x) {
      library(cluster)
      s <- pam(ani.d, x, do.swap=FALSE, pamonce=1)$silinfo
      c(s$avg.width, -sum(pmin(s$widths[, 3], 0)))
    }),
    finally = stopCluster(cl)
  )
  # Standardise each score; sd() is NA when a single k was evaluated, in
  # which case only the small constant remains in the denominator.
  zscore <- function(x, eps) {
    spread <- sd(x)
    if (is.na(spread)) spread <- 0
    (x - mean(x)) / (spread + eps)
  }
  s.avg.z <- zscore(s[1, ], 0.0001)
  s.neg.z <- zscore(s[2, ], 0.01)
  k.rank <- seq_along(k)
  ds <- s.avg.z - s.neg.z - 2 / k.rank - k.rank / 50
  top.n <- k[which.max(ds)]

  # Classify genomes
  say("Classify => k :", top.n, "| n :", n)
  ani.cl <- pam(ani.d, top.n, pamonce=1)
  ani.types <- ani.cl$clustering
  ani.medoids <- ani.cl$medoids

  # Generate graphic report
  say("Graphic report")
  pdf(paste0(out_base, ".pdf"), 7, 12)
  tryCatch({
    layout(matrix(c(1, 1, 2, 2, 3, 3, 4, 5), byrow=TRUE, ncol=2))
    plot_distances(ani.d)
    plot_silhouette(k, s[1, ], s[2, ], ds, top.n)
    plot_clustering(ani.cl, ani.d, ani.types)
    plot_tree(ani.ph, ani.types, ani.medoids)
  }, finally = dev.off())

  # Save results
  say("Text report")
  write.table(ani.medoids, paste(out_base, "medoids", sep="."),
    quote=FALSE, col.names=FALSE, row.names=FALSE)
  save(ani.d, file=paste(out_base, "dist.rdata", sep="."))
  # Columns: genome, cluster number, medoid of that cluster, and
  # 100 - 100 * distance between the genome and that medoid.
  classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
  ani.d.m <- 100 - as.matrix(ani.d) * 100
  classif[, 4] <- ani.d.m[cbind(classif[, 1], classif[, 3])]
  write.table(classif, paste(out_base, "classif", sep="."),
    quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")

  # Recursive search
  say("Recursive search")
  for (i in seq_len(top.n)) {
    medoid <- ani.medoids[i]
    ds_f <- names(ani.types)[ ani.types == i ]
    say("Analyzing subclade", i, "with medoid:", medoid)
    sc_dir <- paste0(out_base, ".sc-", i)
    dir.create(sc_dir)
    write.table(ds_f, paste0(sc_dir, "/miga-project.all"),
      quote=FALSE, col.names=FALSE, row.names=FALSE)
    if (length(ds_f) > 5) {
      # Keep only the pairs where both genomes belong to this subclade
      a_f <- a[ (a$a %in% ds_f) & (a$b %in% ds_f), ]
      subclades(out_base=paste0(sc_dir, "/miga-project"), thr=thr, ani=a_f)
    }
  }
}
103
+
104
+ #= Helper functions
105
# Print a progress message to standard output, prefixed with the current
# date and time between square brackets.
#
# Arguments:
#   ...  Values to print; they are passed to cat() after the prefix.
say <- function(...) {
  cat("[", date(), "]", ..., "\n")
}
106
+
107
# Write placeholder outputs for a run without data: a PDF showing the text
# "No data", and the empty files `<out_base>.1.classif` and
# `<out_base>.1.medoids`.
#
# Arguments:
#   out_base  Prefix of the files to create.
generate_empty_files <- function(out_base) {
  pdf(paste0(out_base, ".pdf"), 7, 12)
  # Close the device even if drawing fails, so the PDF is never left open
  tryCatch({
    plot(1, type="n", axes=FALSE)
    legend("center", "No data", bty="n")
  }, finally = dev.off())
  file.create(paste0(out_base, ".1.classif"))
  file.create(paste0(out_base, ".1.medoids"))
}
115
+
116
# Draw three overlaid curves against the number of clusters: `s` as a grey
# area (left axis), `ns` as dark red points and lines (right axis), and `ds`
# as a plain line without an axis. A dashed vertical line marks `top.n`.
#
# Arguments:
#   k      Numbers of clusters evaluated (x values).
#   s      Values labelled "Mean silhouette", one per element of `k`.
#   ns     Values labelled "Negative silhouette area", one per element of `k`.
#   ds     Combined score, one per element of `k`.
#   top.n  Selected number of clusters.
plot_silhouette <- function(k, s, ns, ds, top.n) {
  x.lim <- range(c(0, k))
  # Open an empty panel scaled to `values`; every layer shares the x range
  empty_panel <- function(values, xlab="", xaxt="n") {
    plot(1, type="n", xlab=xlab, xaxt=xaxt, ylab="", yaxt="n", xlim=x.lim,
      ylim=range(values), bty="n", xaxs="i")
  }

  # s
  par(mar=c(4, 5, 1, 5) + 0.1)
  empty_panel(s, xlab="k (clusters)", xaxt="s")
  polygon(c(k[1], k, k[length(k)]), c(0, s, 0), border=NA, col="grey80")
  axis(2, fg="grey60", col.axis="grey60")
  mtext("Mean silhouette", side=2, line=3, col="grey60")

  # ns
  par(new=TRUE)
  empty_panel(ns)
  points(k, ns, type="o", pch=16, col=rgb(1/2, 0, 0, 3/4))
  axis(4, fg="darkred", col.axis="darkred")
  mtext("Negative silhouette area", side=4, line=3, col="darkred")

  # ds
  par(new=TRUE)
  empty_panel(ds)
  lines(k, ds)
  abline(v=top.n, lty=2)
}
138
+
139
# Plot a histogram of the distances, using 50 as the suggested number of
# breaks.
#
# Arguments:
#   dist  Distances to summarise (the caller passes the ANI distance object).
#
# Returns the value of hist().
plot_distances <- function(dist) {
  par(mar=c(5, 4, 1, 2) + 0.1)
  hist(dist, border=NA, col="grey60", breaks=50, xlab="Distances", main="")
}
143
+
144
# Plot the silhouette of a clustering followed by two scatter panels with the
# first four classical MDS components (1 vs 2, and 3 vs 4), coloured by
# cluster. The two scatter panels are left blank when there are 15 objects
# or fewer, or when cmdscale() returns fewer than four components.
#
# Arguments:
#   cl     Clustering object; `cl$medoids` is read and `cl` is passed to
#          silhouette().
#   dist   Distances between the clustered objects.
#   types  Cluster number of each object, used to index the colours.
plot_clustering <- function(cl, dist, types) {
  par(mar=c(5, 4, 4, 2) + 0.1)
  col <- ggplotColours(length(cl$medoids))
  plot(silhouette(cl), col=col)

  # Only attempt the ordination with more than 15 objects
  ani.mds <- NULL
  if (length(labels(dist)) > 15) {
    ani.mds <- cmdscale(dist, k=4)
  }

  if (!is.null(ani.mds) && ncol(ani.mds) == 4) {
    plot(ani.mds[, 1], ani.mds[, 2], col=col[types], cex=1/2,
      xlab='Component 1', ylab='Component 2')
    plot(ani.mds[, 3], ani.mds[, 4], col=col[types], cex=1/2,
      xlab='Component 3', ylab='Component 4')
  } else {
    # Two empty panels, so the page layout advances as in the full report
    for (panel in 1:2) {
      plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
    }
  }
}
165
+
166
# Plot a tree on a full page with tips coloured by cluster. Medoid tips are
# drawn larger and in bold, and get the suffix " [<cluster>]".
#
# Arguments:
#   phy      Tree whose `tip.label` entries are names present in `types`.
#   types    Named vector with the cluster number of each tip.
#   medoids  Names of the tips that are cluster medoids.
plot_tree <- function(phy, types, medoids){
  layout(1)
  col <- ggplotColours(length(unique(types)))
  # Resolve the cluster of every tip BEFORE the labels are rewritten: once a
  # medoid label carries its suffix it no longer matches names(types), and
  # looking it up would yield NA (and therefore an NA colour).
  tip.types <- types[phy$tip.label]
  is.medoid <- phy$tip.label %in% medoids
  phy$tip.label[is.medoid] <- paste(phy$tip.label[is.medoid],
    " [", tip.types[is.medoid], "]", sep='')
  plot(phy, cex=ifelse(is.medoid, 1/3, 1/6),
    font=ifelse(is.medoid, 2, 1),
    tip.color=col[tip.types])
}
177
+
178
# Generate `n` colours with hues evenly spaced between h[1] and h[2], at
# fixed chroma (100) and luminance (65).
#
# Arguments:
#   n      Number of colours.
#   h      Hue range, in degrees.
#   alpha  Opacity passed to hcl().
#
# Returns a character vector of `n` colours.
ggplotColours <- function(n=6, h=c(0, 360)+15, alpha=1){
  # When the range spans (almost) a full turn, the two ends would be the same
  # hue: shorten the range by one step so all colours are distinct.
  if ((diff(h) %% 360) < 1) h[2] <- h[2] - 360/n
  hcl(h=seq(h[1], h[2], length.out=n), c=100, l=65, alpha=alpha)
}
182
+
183
+ #= Main
184
# Command-line arguments: <ani_file> <out_base> [threads]
# Without both mandatory arguments `out_base` would be NA and end up as the
# literal text "NA" in every output path, so fail early instead.
if (length(argv) < 2) {
  stop("Usage: Rscript <script> <ani_file> <out_base> [threads]",
    call.=FALSE)
}
# The number of threads is optional and defaults to 1
threads <- if (is.na(argv[3])) 1 else as.numeric(argv[3])
subclades(ani_file=argv[1], out_base=argv[2], thr=threads)
186
+
data/utils/subclades.R CHANGED
@@ -30,10 +30,14 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
30
30
  # Get ANI distances
31
31
  say("Distances")
32
32
  a$d <- 1-a$value/100
33
- ani.d <- enve.df2dist(data.frame(a$a, a$b, a$d), default.d=max(a$d)*1.1)
34
- ani.hc <- hclust(ani.d, method="ward.D2")
35
- ani.ph <- as.phylo(ani.hc)
36
- write.tree(as.phylo(ani.hc), paste(out_base, ".nwk", sep=""))
33
+ ani.d <- enve.df2dist(data.frame(a$a, a$b, a$d), default.d=max(a$d)*1.2)
34
+ ani.ph <- bionj(ani.d)
35
+ express.ori <- options('expressions')$expressions
36
+ if(express.ori < ani.ph$Nnode*4){
37
+ options(expressions=min(c(5e7,ani.ph$Nnode*4)))
38
+ }
39
+ write.tree(ani.ph, paste(out_base, ".nwk", sep=""))
40
+ options(expressions=express.ori)
37
41
 
38
42
  # Silhouette
39
43
  say("Silhouette")
@@ -45,11 +49,13 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
45
49
  c(s$avg.width, -sum(ifelse(s$widths[,3]>0,0,s$widths[,3])))
46
50
  })
47
51
  stopCluster(cl)
48
- ds <- s[1,]/s[2,]
52
+ s.avg.z <- (s[1,]-mean(s[1,]))/(sd(s[1,])+0.0001)
53
+ s.neg.z <- (s[2,]-mean(s[2,]))/(sd(s[2,])+0.01)
54
+ ds <- s.avg.z - s.neg.z - 2/(1:length(k)) - (1:length(k))/50
49
55
  top.n <- k[which.max(ds)]
50
56
 
51
57
  # Classify genomes
52
- say("Classify")
58
+ say("Classify => k :", top.n, "| n :", length(labels(ani.d)))
53
59
  ani.cl <- pam(ani.d, top.n, pamonce=1)
54
60
  ani.types <- ani.cl$clustering
55
61
  ani.medoids <- ani.cl$medoids
@@ -57,9 +63,9 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
57
63
  # Generate graphic report
58
64
  say("Graphic report")
59
65
  pdf(paste(out_base, ".pdf", sep=""), 7, 12)
60
- layout(1:4)
66
+ layout(matrix(c(1,1,2,2,3,3,4,5),byrow=TRUE, ncol=2))
61
67
  plot_distances(ani.d)
62
- plot_silhouette(k, s[1,], s[2,], top.n)
68
+ plot_silhouette(k, s[1,], s[2,], ds, top.n)
63
69
  plot_clustering(ani.cl, ani.d, ani.types)
64
70
  plot_tree(ani.ph, ani.types, ani.medoids)
65
71
  dev.off()
@@ -70,8 +76,9 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
70
76
  quote=FALSE, col.names=FALSE, row.names=FALSE)
71
77
  save(ani.d, file=paste(out_base, "dist.rdata", sep="."))
72
78
  classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
79
+ ani.d.m <- 100 - as.matrix(ani.d)*100
73
80
  for(j in 1:nrow(classif)){
74
- classif[j,4] <- 100 - as.matrix(ani.d)[classif[j,1], classif[j,3]]
81
+ classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
75
82
  }
76
83
  write.table(classif, paste(out_base,"classif",sep="."),
77
84
  quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
@@ -106,19 +113,26 @@ generate_empty_files <- function(out_base) {
106
113
  file.create(paste(out_base,".1.medoids",sep=""))
107
114
  }
108
115
 
109
- plot_silhouette <- function(k, s, ds, top.n) {
116
+ plot_silhouette <- function(k, s, ns, ds, top.n) {
117
+ # s
110
118
  par(mar=c(4,5,1,5)+0.1)
111
119
  plot(1, t="n", xlab="k (clusters)", ylab="", xlim=range(c(0,k)),
112
120
  ylim=range(s), bty="n", xaxs="i", yaxt="n")
113
121
  polygon(c(k[1], k, k[length(k)]), c(0,s,0), border=NA, col="grey80")
114
122
  axis(2, fg="grey60", col.axis="grey60")
115
123
  mtext("Mean silhouette", side=2, line=3, col="grey60")
124
+ # ns
116
125
  par(new=TRUE)
117
126
  plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
118
- ylim=range(ds), bty="n", xaxs="i")
119
- points(k, ds, type="o", pch=16, col=rgb(1/2,0,0,3/4))
127
+ ylim=range(ns), bty="n", xaxs="i")
128
+ points(k, ns, type="o", pch=16, col=rgb(1/2,0,0,3/4))
120
129
  axis(4, fg="darkred", col.axis="darkred")
121
130
  mtext("Negative silhouette area", side=4, line=3, col="darkred")
131
+ # ds
132
+ par(new=TRUE)
133
+ plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
134
+ ylim=range(ds), bty="n", xaxs="i")
135
+ lines(k, ds)
122
136
  abline(v=top.n, lty=2)
123
137
  }
124
138
 
@@ -134,8 +148,18 @@ plot_clustering <- function(cl, dist, types) {
134
148
  plot(silhouette(cl), col=col)
135
149
  if(length(labels(dist))<=15){
136
150
  plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
151
+ plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
137
152
  }else{
138
- clusplot(cl, dist=dist, main="", col.p=col[types], lines=0)
153
+ ani.mds <- cmdscale(dist, k=4)
154
+ if(ncol(ani.mds)==4){
155
+ plot(ani.mds[,1], ani.mds[,2], col=col[types], cex=1/2,
156
+ xlab='Component 1', ylab='Component 2')
157
+ plot(ani.mds[,3], ani.mds[,4], col=col[types], cex=1/2,
158
+ xlab='Component 3', ylab='Component 4')
159
+ }else{
160
+ plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
161
+ plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
162
+ }
139
163
  }
140
164
  }
141
165
 
@@ -144,6 +168,8 @@ plot_tree <- function(phy, types, medoids){
144
168
  top.n <- length(unique(types))
145
169
  col <- ggplotColours(top.n)
146
170
  is.medoid <- phy$tip.label %in% medoids
171
+ phy$tip.label[is.medoid] <- paste(phy$tip.label[is.medoid],
172
+ " [", types[phy$tip.label[is.medoid]], "]", sep='')
147
173
  plot(phy, cex=ifelse(is.medoid, 1/3, 1/6),
148
174
  font=ifelse(is.medoid, 2, 1),
149
175
  tip.color=col[types[phy$tip.label]])
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2.1
4
+ version: 0.2.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
@@ -30,14 +30,14 @@ dependencies:
30
30
  requirements:
31
31
  - - ~>
32
32
  - !ruby/object:Gem::Version
33
- version: '1.2'
33
+ version: 1.2.4
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - ~>
39
39
  - !ruby/object:Gem::Version
40
- version: '1.2'
40
+ version: 1.2.4
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: json
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -140,6 +140,8 @@ files:
140
140
  - utils/plot-taxdist.R
141
141
  - utils/requirements.txt
142
142
  - utils/subclades-compile.rb
143
+ - utils/subclades-nj.R
144
+ - utils/subclades-pam.R
143
145
  - utils/subclades.R
144
146
  - bin/miga
145
147
  - actions/add_result.rb
@@ -154,6 +156,7 @@ files:
154
156
  - actions/index_taxonomy.rb
155
157
  - actions/list_datasets.rb
156
158
  - actions/list_files.rb
159
+ - actions/plugins.rb
157
160
  - actions/project_info.rb
158
161
  - actions/result_stats.rb
159
162
  - actions/tax_distributions.rb