miga-base 0.2.2.1 → 0.2.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,186 @@
1
+ #!/usr/bin/env Rscript
2
+ #
3
+ # @package MiGA
4
+ # @license Artistic-2.0
5
+ #
6
+
7
+ #= Load stuff
8
+ argv <- commandArgs(trailingOnly=T)
9
+ suppressPackageStartupMessages(library(ape))
10
+ suppressPackageStartupMessages(library(vegan))
11
+ suppressPackageStartupMessages(library(cluster))
12
+ suppressPackageStartupMessages(library(parallel))
13
+ suppressPackageStartupMessages(library(enveomics.R))
14
+
15
+ #= Main function
16
# Recursively split a set of genomes into subclades using ANI distances.
#
# ani_file: Path to a tab-delimited (optionally gzipped) table with header,
#           read with read.table(). Columns `a`, `b` and `value` are used.
#           Only used when given; otherwise `ani` is used.
# out_base: Prefix of all output paths (.nwk, .pdf, .medoids, .dist.rdata,
#           .classif and the .sc-<i>/ folders used by the recursive calls).
# thr:      Number of workers passed to makeCluster().
# ani:      Table with the same columns as the one in `ani_file`; used by
#           the recursive calls.
#
# Called for its side effects (files written under `out_base`); returns NULL.
subclades <- function(ani_file, out_base, thr=1, ani=c()) {
  say("==> Out base:", out_base, "<==")

  # Input arguments
  if (missing(ani_file)) {
    a <- as.data.frame(ani)
  } else {
    a <- read.table(gzfile(ani_file), sep="\t", header=TRUE, as.is=TRUE)
  }
  if (nrow(a) == 0) {
    generate_empty_files(out_base)
    return(NULL)
  }

  # Get ANI distances
  say("Distances")
  a$d <- 1 - a$value/100
  ani.d <- enve.df2dist(data.frame(a$a, a$b, a$d), default.d=max(a$d)*1.2)
  n <- length(labels(ani.d))
  if (n < 3) {
    # pam() needs k < n, so no clustering with k >= 2 exists for fewer than
    # 3 genomes. Treat it like the empty-input case instead of crashing.
    say("Too few genomes to cluster:", n)
    generate_empty_files(out_base)
    return(NULL)
  }
  ani.ph <- bionj(ani.d)

  # Temporarily raise the nested-expressions limit while writing the tree.
  # R only accepts values up to 500000 for this option, and the original
  # value is restored even if write.tree() fails.
  express.ori <- getOption("expressions")
  express.req <- ani.ph$Nnode * 4
  if (express.ori < express.req) {
    options(expressions=min(5e5, express.req))
  }
  tryCatch(
    write.tree(ani.ph, paste0(out_base, ".nwk")),
    finally=options(expressions=express.ori)
  )

  # Silhouette
  say("Silhouette")
  k <- 2:min(n - 1, 100)
  cl <- makeCluster(thr)
  # Row 1: mean silhouette width; row 2: negative silhouette area.
  # The workers are always stopped, even when a pam() call fails.
  s <- tryCatch(
    parSapply(cl, k, function(x) {
      library(cluster)
      s <- pam(ani.d, x, do.swap=FALSE, pamonce=1)$silinfo
      c(s$avg.width, -sum(ifelse(s$widths[,3]>0,0,s$widths[,3])))
    }),
    finally=stopCluster(cl)
  )
  # sd() of a single value is NA, which would leave no k to select when
  # only one value of k was evaluated (n == 3).
  sd_or_zero <- function(x) if (length(x) > 1) sd(x) else 0
  s.avg.z <- (s[1,] - mean(s[1,]))/(sd_or_zero(s[1,]) + 0.0001)
  s.neg.z <- (s[2,] - mean(s[2,]))/(sd_or_zero(s[2,]) + 0.01)
  # Score per k: standardized mean silhouette minus standardized negative
  # area, minus two terms that depend only on the position of k
  k.idx <- seq_along(k)
  ds <- s.avg.z - s.neg.z - 2/k.idx - k.idx/50
  top.n <- k[which.max(ds)]

  # Classify genomes
  say("Classify => k :", top.n, "| n :", n)
  ani.cl <- pam(ani.d, top.n, pamonce=1)
  ani.types <- ani.cl$clustering
  ani.medoids <- ani.cl$medoids

  # Generate graphic report
  say("Graphic report")
  pdf(paste0(out_base, ".pdf"), 7, 12)
  # Close the PDF device even if one of the plots fails
  tryCatch({
    layout(matrix(c(1,1,2,2,3,3,4,5), byrow=TRUE, ncol=2))
    plot_distances(ani.d)
    plot_silhouette(k, s[1,], s[2,], ds, top.n)
    plot_clustering(ani.cl, ani.d, ani.types)
    plot_tree(ani.ph, ani.types, ani.medoids)
  }, finally=dev.off())

  # Save results
  say("Text report")
  write.table(ani.medoids, paste(out_base, "medoids", sep="."),
    quote=FALSE, col.names=FALSE, row.names=FALSE)
  save(ani.d, file=paste(out_base, "dist.rdata", sep="."))
  # Columns: genome, cluster, medoid of the cluster, ANI to that medoid
  classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
  ani.d.m <- 100 - as.matrix(ani.d)*100
  # Look up every (genome, medoid) pair at once by row/column name
  classif[, 4] <- ani.d.m[cbind(classif[, 1], classif[, 3])]
  write.table(classif, paste(out_base, "classif", sep="."),
    quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")

  # Recursive search
  say("Recursive search")
  for (i in seq_len(top.n)) {
    medoid <- ani.medoids[i]
    ds_f <- names(ani.types)[ ani.types == i ]
    say("Analyzing subclade", i, "with medoid:", medoid)
    sc_dir <- paste0(out_base, ".sc-", i)
    dir.create(sc_dir)
    write.table(ds_f, paste0(sc_dir, "/miga-project.all"),
      quote=FALSE, col.names=FALSE, row.names=FALSE)
    # Only clusters with more than 5 genomes are split further
    if (length(ds_f) > 5) {
      a_f <- a[ (a$a %in% ds_f) & (a$b %in% ds_f), ]
      subclades(out_base=paste0(sc_dir, "/miga-project"), thr=thr, ani=a_f)
    }
  }
}
103
+
104
+ #= Helper functions
105
# Print the given values to standard output, prefixed by the current date
# in square brackets and followed by a line break.
say <- function(...) {
  stamp <- date()
  cat("[", stamp, "]", ..., "\n")
}
106
+
107
# Write placeholder outputs for a run without data: a PDF showing only the
# text "No data", plus empty <out_base>.1.classif and <out_base>.1.medoids
# files.
generate_empty_files <- function(out_base) {
  pdf(paste0(out_base, ".pdf"), width=7, height=12)
  plot(1, type="n", axes=FALSE)
  legend("center", legend="No data", bty="n")
  dev.off()
  placeholders <- paste0(out_base, ".1.", c("classif", "medoids"))
  file.create(placeholders[1])
  file.create(placeholders[2])
}
115
+
116
# Draw three overlaid curves against the number of clusters `k`:
#   s  - filled grey area, scale on the left axis ("Mean silhouette")
#   ns - dark red points/line, scale on the right axis
#        ("Negative silhouette area")
#   ds - black line, no axis of its own
# A dashed vertical line marks `top.n`.
plot_silhouette <- function(k, s, ns, ds, top.n) {
  k.lim <- range(c(0, k))

  # Open a new empty layer on top of the current panel, sharing the x range
  # but using the range of `y.values` as y scale
  new_layer <- function(y.values) {
    par(new=TRUE)
    plot(1, type="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=k.lim,
      ylim=range(y.values), bty="n", xaxs="i")
  }

  # s
  par(mar=c(4,5,1,5)+0.1)
  plot(1, type="n", xlab="k (clusters)", ylab="", xlim=k.lim,
    ylim=range(s), bty="n", xaxs="i", yaxt="n")
  area.x <- c(k[1], k, k[length(k)])
  area.y <- c(0, s, 0)
  polygon(area.x, area.y, border=NA, col="grey80")
  axis(2, fg="grey60", col.axis="grey60")
  mtext("Mean silhouette", side=2, line=3, col="grey60")

  # ns
  new_layer(ns)
  points(k, ns, type="o", pch=16, col=rgb(1/2,0,0,3/4))
  axis(4, fg="darkred", col.axis="darkred")
  mtext("Negative silhouette area", side=4, line=3, col="darkred")

  # ds
  new_layer(ds)
  lines(k, ds)
  abline(v=top.n, lty=2)
}
138
+
139
# Draw a borderless grey histogram (50 breaks requested) of the values in
# `dist`, with the x axis labelled "Distances" and no title.
plot_distances <- function(dist) {
  margins <- c(5, 4, 1, 2) + 0.1
  par(mar=margins)
  hist(dist, breaks=50, col="grey60", border=NA, main="", xlab="Distances")
}
143
+
144
# Fill three panels describing a clustering:
#   1. the silhouette plot of `cl`, one colour per medoid;
#   2-3. scatter plots of cmdscale() components 1 vs 2 and 3 vs 4, coloured
#        by `types`. Both panels are left blank when `dist` has 15 labels or
#        fewer, or when cmdscale() does not return 4 columns.
plot_clustering <- function(cl, dist, types) {
  # Empty placeholder so that the panel is still consumed in the layout
  blank_panel <- function() {
    plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
  }

  par(mar=c(5,4,4,2)+0.1)
  col <- ggplotColours(length(cl$medoids))
  plot(silhouette(cl), col=col)

  coords <- NULL
  if (length(labels(dist)) > 15) {
    coords <- cmdscale(dist, k=4)
  }
  if (!is.null(coords) && ncol(coords) == 4) {
    point.col <- col[types]
    plot(coords[,1], coords[,2], col=point.col, cex=1/2,
      xlab='Component 1', ylab='Component 2')
    plot(coords[,3], coords[,4], col=point.col, cex=1/2,
      xlab='Component 3', ylab='Component 4')
  } else {
    blank_panel()
    blank_panel()
  }
}
165
+
166
# Plot the tree `phy` on a single full-page panel, colouring each tip by its
# cluster and highlighting medoids.
#
# phy:     Tree object with a `tip.label` element, passed to plot().
# types:   Cluster index per genome, named by genome (tip label).
# medoids: Tip labels of the cluster medoids. Medoid tips are drawn larger,
#          in bold, and with " [<cluster>]" appended to their label.
plot_tree <- function(phy, types, medoids){
  layout(1)
  top.n <- length(unique(types))
  col <- ggplotColours(top.n)
  is.medoid <- phy$tip.label %in% medoids
  # Resolve the cluster of every tip BEFORE decorating the medoid labels:
  # `types` is indexed by the original names, so looking up the decorated
  # labels would return NA and leave the medoid tips without a colour.
  tip.types <- types[phy$tip.label]
  phy$tip.label[is.medoid] <- paste(phy$tip.label[is.medoid],
    " [", tip.types[is.medoid], "]", sep='')
  plot(phy, cex=ifelse(is.medoid, 1/3, 1/6),
    font=ifelse(is.medoid, 2, 1),
    tip.color=col[tip.types])
}
177
+
178
# Build `n` colours with hues evenly spaced between h[1] and h[2], at fixed
# chroma (100) and luminance (65), with transparency `alpha`.
# When the hue range spans (almost exactly) a multiple of the full circle,
# the upper end is pulled back by one step (360/n) before spacing the hues.
ggplotColours <- function(n=6, h=c(0, 360)+15, alpha=1) {
  wraps.around <- (diff(h) %% 360) < 1
  if (wraps.around) {
    h[2] <- h[2] - 360/n
  }
  hues <- seq(h[1], h[2], length.out=n)
  hcl(h=hues, c=100, l=65, alpha=alpha)
}
182
+
183
+ #= Main
184
# Command-line arguments: <ani_file> <out_base> [threads]; the number of
# threads falls back to 1 when the third argument is absent.
subclades(
  ani_file=argv[1],
  out_base=argv[2],
  thr=if (is.na(argv[3])) 1 else as.numeric(argv[3])
)
186
+
data/utils/subclades.R CHANGED
@@ -30,10 +30,14 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
30
30
  # Get ANI distances
31
31
  say("Distances")
32
32
  a$d <- 1-a$value/100
33
- ani.d <- enve.df2dist(data.frame(a$a, a$b, a$d), default.d=max(a$d)*1.1)
34
- ani.hc <- hclust(ani.d, method="ward.D2")
35
- ani.ph <- as.phylo(ani.hc)
36
- write.tree(as.phylo(ani.hc), paste(out_base, ".nwk", sep=""))
33
+ ani.d <- enve.df2dist(data.frame(a$a, a$b, a$d), default.d=max(a$d)*1.2)
34
+ ani.ph <- bionj(ani.d)
35
+ express.ori <- options('expressions')$expressions
36
+ if(express.ori < ani.ph$Nnode*4){
37
+ options(expressions=min(c(5e7,ani.ph$Nnode*4)))
38
+ }
39
+ write.tree(ani.ph, paste(out_base, ".nwk", sep=""))
40
+ options(expressions=express.ori)
37
41
 
38
42
  # Silhouette
39
43
  say("Silhouette")
@@ -45,11 +49,13 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
45
49
  c(s$avg.width, -sum(ifelse(s$widths[,3]>0,0,s$widths[,3])))
46
50
  })
47
51
  stopCluster(cl)
48
- ds <- s[1,]/s[2,]
52
+ s.avg.z <- (s[1,]-mean(s[1,]))/(sd(s[1,])+0.0001)
53
+ s.neg.z <- (s[2,]-mean(s[2,]))/(sd(s[2,])+0.01)
54
+ ds <- s.avg.z - s.neg.z - 2/(1:length(k)) - (1:length(k))/50
49
55
  top.n <- k[which.max(ds)]
50
56
 
51
57
  # Classify genomes
52
- say("Classify")
58
+ say("Classify => k :", top.n, "| n :", length(labels(ani.d)))
53
59
  ani.cl <- pam(ani.d, top.n, pamonce=1)
54
60
  ani.types <- ani.cl$clustering
55
61
  ani.medoids <- ani.cl$medoids
@@ -57,9 +63,9 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
57
63
  # Generate graphic report
58
64
  say("Graphic report")
59
65
  pdf(paste(out_base, ".pdf", sep=""), 7, 12)
60
- layout(1:4)
66
+ layout(matrix(c(1,1,2,2,3,3,4,5),byrow=TRUE, ncol=2))
61
67
  plot_distances(ani.d)
62
- plot_silhouette(k, s[1,], s[2,], top.n)
68
+ plot_silhouette(k, s[1,], s[2,], ds, top.n)
63
69
  plot_clustering(ani.cl, ani.d, ani.types)
64
70
  plot_tree(ani.ph, ani.types, ani.medoids)
65
71
  dev.off()
@@ -70,8 +76,9 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
70
76
  quote=FALSE, col.names=FALSE, row.names=FALSE)
71
77
  save(ani.d, file=paste(out_base, "dist.rdata", sep="."))
72
78
  classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
79
+ ani.d.m <- 100 - as.matrix(ani.d)*100
73
80
  for(j in 1:nrow(classif)){
74
- classif[j,4] <- 100 - as.matrix(ani.d)[classif[j,1], classif[j,3]]
81
+ classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
75
82
  }
76
83
  write.table(classif, paste(out_base,"classif",sep="."),
77
84
  quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
@@ -106,19 +113,26 @@ generate_empty_files <- function(out_base) {
106
113
  file.create(paste(out_base,".1.medoids",sep=""))
107
114
  }
108
115
 
109
- plot_silhouette <- function(k, s, ds, top.n) {
116
+ plot_silhouette <- function(k, s, ns, ds, top.n) {
117
+ # s
110
118
  par(mar=c(4,5,1,5)+0.1)
111
119
  plot(1, t="n", xlab="k (clusters)", ylab="", xlim=range(c(0,k)),
112
120
  ylim=range(s), bty="n", xaxs="i", yaxt="n")
113
121
  polygon(c(k[1], k, k[length(k)]), c(0,s,0), border=NA, col="grey80")
114
122
  axis(2, fg="grey60", col.axis="grey60")
115
123
  mtext("Mean silhouette", side=2, line=3, col="grey60")
124
+ # ns
116
125
  par(new=TRUE)
117
126
  plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
118
- ylim=range(ds), bty="n", xaxs="i")
119
- points(k, ds, type="o", pch=16, col=rgb(1/2,0,0,3/4))
127
+ ylim=range(ns), bty="n", xaxs="i")
128
+ points(k, ns, type="o", pch=16, col=rgb(1/2,0,0,3/4))
120
129
  axis(4, fg="darkred", col.axis="darkred")
121
130
  mtext("Negative silhouette area", side=4, line=3, col="darkred")
131
+ # ds
132
+ par(new=TRUE)
133
+ plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
134
+ ylim=range(ds), bty="n", xaxs="i")
135
+ lines(k, ds)
122
136
  abline(v=top.n, lty=2)
123
137
  }
124
138
 
@@ -134,8 +148,18 @@ plot_clustering <- function(cl, dist, types) {
134
148
  plot(silhouette(cl), col=col)
135
149
  if(length(labels(dist))<=15){
136
150
  plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
151
+ plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
137
152
  }else{
138
- clusplot(cl, dist=dist, main="", col.p=col[types], lines=0)
153
+ ani.mds <- cmdscale(dist, k=4)
154
+ if(ncol(ani.mds)==4){
155
+ plot(ani.mds[,1], ani.mds[,2], col=col[types], cex=1/2,
156
+ xlab='Component 1', ylab='Component 2')
157
+ plot(ani.mds[,3], ani.mds[,4], col=col[types], cex=1/2,
158
+ xlab='Component 3', ylab='Component 4')
159
+ }else{
160
+ plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
161
+ plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
162
+ }
139
163
  }
140
164
  }
141
165
 
@@ -144,6 +168,8 @@ plot_tree <- function(phy, types, medoids){
144
168
  top.n <- length(unique(types))
145
169
  col <- ggplotColours(top.n)
146
170
  is.medoid <- phy$tip.label %in% medoids
171
+ phy$tip.label[is.medoid] <- paste(phy$tip.label[is.medoid],
172
+ " [", types[phy$tip.label[is.medoid]], "]", sep='')
147
173
  plot(phy, cex=ifelse(is.medoid, 1/3, 1/6),
148
174
  font=ifelse(is.medoid, 2, 1),
149
175
  tip.color=col[types[phy$tip.label]])
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2.1
4
+ version: 0.2.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
@@ -30,14 +30,14 @@ dependencies:
30
30
  requirements:
31
31
  - - ~>
32
32
  - !ruby/object:Gem::Version
33
- version: '1.2'
33
+ version: 1.2.4
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - ~>
39
39
  - !ruby/object:Gem::Version
40
- version: '1.2'
40
+ version: 1.2.4
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: json
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -140,6 +140,8 @@ files:
140
140
  - utils/plot-taxdist.R
141
141
  - utils/requirements.txt
142
142
  - utils/subclades-compile.rb
143
+ - utils/subclades-nj.R
144
+ - utils/subclades-pam.R
143
145
  - utils/subclades.R
144
146
  - bin/miga
145
147
  - actions/add_result.rb
@@ -154,6 +156,7 @@ files:
154
156
  - actions/index_taxonomy.rb
155
157
  - actions/list_datasets.rb
156
158
  - actions/list_files.rb
159
+ - actions/plugins.rb
157
160
  - actions/project_info.rb
158
161
  - actions/result_stats.rb
159
162
  - actions/tax_distributions.rb