miga-base 0.2.2.1 → 0.2.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +4 -1
- data/actions/create_dataset.rb +2 -5
- data/actions/daemon.rb +1 -0
- data/actions/plugins.rb +25 -0
- data/actions/result_stats.rb +10 -0
- data/bin/miga +1 -0
- data/lib/miga/daemon.rb +12 -4
- data/lib/miga/dataset.rb +4 -3
- data/lib/miga/project.rb +38 -4
- data/lib/miga/remote_dataset.rb +2 -2
- data/lib/miga/version.rb +1 -1
- data/scripts/_distances_functions.bash +20 -20
- data/scripts/_distances_noref_nomulti.bash +20 -13
- data/scripts/_distances_ref_nomulti.bash +11 -10
- data/scripts/aai_distances.bash +15 -12
- data/scripts/ani_distances.bash +14 -11
- data/scripts/assembly.bash +2 -1
- data/scripts/cds.bash +2 -2
- data/scripts/clade_finding.bash +2 -1
- data/scripts/distances.bash +2 -2
- data/scripts/essential_genes.bash +14 -4
- data/scripts/haai_distances.bash +17 -20
- data/scripts/init.bash +1 -1
- data/scripts/miga.bash +6 -0
- data/scripts/mytaxa.bash +2 -2
- data/scripts/mytaxa_scan.bash +2 -2
- data/scripts/ogs.bash +2 -2
- data/scripts/read_quality.bash +2 -2
- data/scripts/ssu.bash +2 -2
- data/scripts/stats.bash +3 -2
- data/scripts/subclades.bash +2 -2
- data/scripts/trimmed_fasta.bash +2 -2
- data/scripts/trimmed_reads.bash +2 -2
- data/test/daemon_test.rb +1 -1
- data/test/test_helper.rb +2 -2
- data/utils/subclades-nj.R +244 -0
- data/utils/subclades-pam.R +186 -0
- data/utils/subclades.R +39 -13
- metadata +6 -3
@@ -0,0 +1,186 @@
|
|
1
|
+
#!/usr/bin/env Rscript
|
2
|
+
#
|
3
|
+
# @package MiGA
|
4
|
+
# @license Artistic-2.0
|
5
|
+
#
|
6
|
+
|
7
|
+
#= Load stuff
|
8
|
+
argv <- commandArgs(trailingOnly=T)
|
9
|
+
suppressPackageStartupMessages(library(ape))
|
10
|
+
suppressPackageStartupMessages(library(vegan))
|
11
|
+
suppressPackageStartupMessages(library(cluster))
|
12
|
+
suppressPackageStartupMessages(library(parallel))
|
13
|
+
suppressPackageStartupMessages(library(enveomics.R))
|
14
|
+
|
15
|
+
#= Main function
|
16
|
+
subclades <- function(ani_file, out_base, thr=1, ani=c()) {
|
17
|
+
say("==> Out base:", out_base, "<==")
|
18
|
+
|
19
|
+
# Input arguments
|
20
|
+
if(missing(ani_file)){
|
21
|
+
a <- as.data.frame(ani)
|
22
|
+
}else{
|
23
|
+
a <- read.table(gzfile(ani_file), sep="\t", header=TRUE, as.is=TRUE)
|
24
|
+
}
|
25
|
+
if(nrow(a)==0){
|
26
|
+
generate_empty_files(out_base)
|
27
|
+
return(NULL)
|
28
|
+
}
|
29
|
+
|
30
|
+
# Get ANI distances
|
31
|
+
say("Distances")
|
32
|
+
a$d <- 1-a$value/100
|
33
|
+
ani.d <- enve.df2dist(data.frame(a$a, a$b, a$d), default.d=max(a$d)*1.2)
|
34
|
+
ani.ph <- bionj(ani.d)
|
35
|
+
express.ori <- options('expressions')$expressions
|
36
|
+
if(express.ori < ani.ph$Nnode*4){
|
37
|
+
options(expressions=min(c(5e7,ani.ph$Nnode*4)))
|
38
|
+
}
|
39
|
+
write.tree(ani.ph, paste(out_base, ".nwk", sep=""))
|
40
|
+
options(expressions=express.ori)
|
41
|
+
|
42
|
+
# Silhouette
|
43
|
+
say("Silhouette")
|
44
|
+
k <- 2:min(length(labels(ani.d))-1, 100)
|
45
|
+
cl <- makeCluster(thr)
|
46
|
+
s <- parSapply(cl, k, function(x) {
|
47
|
+
library(cluster)
|
48
|
+
s <- pam(ani.d, x, do.swap=FALSE, pamonce=1)$silinfo
|
49
|
+
c(s$avg.width, -sum(ifelse(s$widths[,3]>0,0,s$widths[,3])))
|
50
|
+
})
|
51
|
+
stopCluster(cl)
|
52
|
+
s.avg.z <- (s[1,]-mean(s[1,]))/(sd(s[1,])+0.0001)
|
53
|
+
s.neg.z <- (s[2,]-mean(s[2,]))/(sd(s[2,])+0.01)
|
54
|
+
ds <- s.avg.z - s.neg.z - 2/(1:length(k)) - (1:length(k))/50
|
55
|
+
top.n <- k[which.max(ds)]
|
56
|
+
|
57
|
+
# Classify genomes
|
58
|
+
say("Classify => k :", top.n, "| n :", length(labels(ani.d)))
|
59
|
+
ani.cl <- pam(ani.d, top.n, pamonce=1)
|
60
|
+
ani.types <- ani.cl$clustering
|
61
|
+
ani.medoids <- ani.cl$medoids
|
62
|
+
|
63
|
+
# Generate graphic report
|
64
|
+
say("Graphic report")
|
65
|
+
pdf(paste(out_base, ".pdf", sep=""), 7, 12)
|
66
|
+
layout(matrix(c(1,1,2,2,3,3,4,5),byrow=TRUE, ncol=2))
|
67
|
+
plot_distances(ani.d)
|
68
|
+
plot_silhouette(k, s[1,], s[2,], ds, top.n)
|
69
|
+
plot_clustering(ani.cl, ani.d, ani.types)
|
70
|
+
plot_tree(ani.ph, ani.types, ani.medoids)
|
71
|
+
dev.off()
|
72
|
+
|
73
|
+
# Save results
|
74
|
+
say("Text report")
|
75
|
+
write.table(ani.medoids, paste(out_base, "medoids", sep="."),
|
76
|
+
quote=FALSE, col.names=FALSE, row.names=FALSE)
|
77
|
+
save(ani.d, file=paste(out_base, "dist.rdata", sep="."))
|
78
|
+
classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
|
79
|
+
ani.d.m <- 100 - as.matrix(ani.d)*100
|
80
|
+
for(j in 1:nrow(classif)){
|
81
|
+
classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
|
82
|
+
}
|
83
|
+
write.table(classif, paste(out_base,"classif",sep="."),
|
84
|
+
quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
|
85
|
+
|
86
|
+
# Recursive search
|
87
|
+
say("Recursive search")
|
88
|
+
for(i in 1:top.n){
|
89
|
+
medoid <- ani.medoids[i]
|
90
|
+
ds_f <- names(ani.types)[ ani.types==i ]
|
91
|
+
say("Analyzing subclade", i, "with medoid:", medoid)
|
92
|
+
dir.create(paste(out_base, ".sc-", i, sep=""))
|
93
|
+
write.table(ds_f,
|
94
|
+
paste(out_base, ".sc-", i, "/miga-project.all",sep=""),
|
95
|
+
quote=FALSE, col.names=FALSE, row.names=FALSE)
|
96
|
+
if(length(ds_f) > 5){
|
97
|
+
a_f <- a[ (a$a %in% ds_f) & (a$b %in% ds_f), ]
|
98
|
+
subclades(out_base=paste(out_base, ".sc-", i, "/miga-project", sep=""),
|
99
|
+
thr=thr, ani=a_f)
|
100
|
+
}
|
101
|
+
}
|
102
|
+
}
|
103
|
+
|
104
|
+
#= Helper functions
|
105
|
+
say <- function(...) { cat("[", date(), "]", ..., "\n") }
|
106
|
+
|
107
|
+
generate_empty_files <- function(out_base) {
|
108
|
+
pdf(paste(out_base, ".pdf", sep=""), 7, 12)
|
109
|
+
plot(1, t="n", axes=F)
|
110
|
+
legend("center", "No data", bty="n")
|
111
|
+
dev.off()
|
112
|
+
file.create(paste(out_base,".1.classif",sep=""))
|
113
|
+
file.create(paste(out_base,".1.medoids",sep=""))
|
114
|
+
}
|
115
|
+
|
116
|
+
plot_silhouette <- function(k, s, ns, ds, top.n) {
|
117
|
+
# s
|
118
|
+
par(mar=c(4,5,1,5)+0.1)
|
119
|
+
plot(1, t="n", xlab="k (clusters)", ylab="", xlim=range(c(0,k)),
|
120
|
+
ylim=range(s), bty="n", xaxs="i", yaxt="n")
|
121
|
+
polygon(c(k[1], k, k[length(k)]), c(0,s,0), border=NA, col="grey80")
|
122
|
+
axis(2, fg="grey60", col.axis="grey60")
|
123
|
+
mtext("Mean silhouette", side=2, line=3, col="grey60")
|
124
|
+
# ns
|
125
|
+
par(new=TRUE)
|
126
|
+
plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
|
127
|
+
ylim=range(ns), bty="n", xaxs="i")
|
128
|
+
points(k, ns, type="o", pch=16, col=rgb(1/2,0,0,3/4))
|
129
|
+
axis(4, fg="darkred", col.axis="darkred")
|
130
|
+
mtext("Negative silhouette area", side=4, line=3, col="darkred")
|
131
|
+
# ds
|
132
|
+
par(new=TRUE)
|
133
|
+
plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
|
134
|
+
ylim=range(ds), bty="n", xaxs="i")
|
135
|
+
lines(k, ds)
|
136
|
+
abline(v=top.n, lty=2)
|
137
|
+
}
|
138
|
+
|
139
|
+
plot_distances <- function(dist) {
|
140
|
+
par(mar=c(5,4,1,2)+0.1)
|
141
|
+
hist(dist, border=NA, col="grey60", breaks=50, xlab="Distances", main="")
|
142
|
+
}
|
143
|
+
|
144
|
+
plot_clustering <- function(cl, dist, types) {
|
145
|
+
par(mar=c(5,4,4,2)+0.1)
|
146
|
+
top.n <- length(cl$medoids)
|
147
|
+
col <- ggplotColours(top.n)
|
148
|
+
plot(silhouette(cl), col=col)
|
149
|
+
if(length(labels(dist))<=15){
|
150
|
+
plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
|
151
|
+
plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
|
152
|
+
}else{
|
153
|
+
ani.mds <- cmdscale(dist, k=4)
|
154
|
+
if(ncol(ani.mds)==4){
|
155
|
+
plot(ani.mds[,1], ani.mds[,2], col=col[types], cex=1/2,
|
156
|
+
xlab='Component 1', ylab='Component 2')
|
157
|
+
plot(ani.mds[,3], ani.mds[,4], col=col[types], cex=1/2,
|
158
|
+
xlab='Component 3', ylab='Component 4')
|
159
|
+
}else{
|
160
|
+
plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
|
161
|
+
plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
|
162
|
+
}
|
163
|
+
}
|
164
|
+
}
|
165
|
+
|
166
|
+
plot_tree <- function(phy, types, medoids){
|
167
|
+
layout(1)
|
168
|
+
top.n <- length(unique(types))
|
169
|
+
col <- ggplotColours(top.n)
|
170
|
+
is.medoid <- phy$tip.label %in% medoids
|
171
|
+
phy$tip.label[is.medoid] <- paste(phy$tip.label[is.medoid],
|
172
|
+
" [", types[phy$tip.label[is.medoid]], "]", sep='')
|
173
|
+
plot(phy, cex=ifelse(is.medoid, 1/3, 1/6),
|
174
|
+
font=ifelse(is.medoid, 2, 1),
|
175
|
+
tip.color=col[types[phy$tip.label]])
|
176
|
+
}
|
177
|
+
|
178
|
+
ggplotColours <- function(n=6, h=c(0, 360)+15, alpha=1){
|
179
|
+
if ((diff(h)%%360) < 1) h[2] <- h[2] - 360/n
|
180
|
+
hcl(h=seq(h[1], h[2], length=n), c=100, l=65, alpha=alpha)
|
181
|
+
}
|
182
|
+
|
183
|
+
#= Main
|
184
|
+
subclades(ani_file=argv[1], out_base=argv[2],
|
185
|
+
thr=ifelse(is.na(argv[3]), 1, as.numeric(argv[3])))
|
186
|
+
|
data/utils/subclades.R
CHANGED
@@ -30,10 +30,14 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
|
|
30
30
|
# Get ANI distances
|
31
31
|
say("Distances")
|
32
32
|
a$d <- 1-a$value/100
|
33
|
-
ani.d <- enve.df2dist(data.frame(a$a, a$b, a$d), default.d=max(a$d)*1.
|
34
|
-
ani.
|
35
|
-
|
36
|
-
|
33
|
+
ani.d <- enve.df2dist(data.frame(a$a, a$b, a$d), default.d=max(a$d)*1.2)
|
34
|
+
ani.ph <- bionj(ani.d)
|
35
|
+
express.ori <- options('expressions')$expressions
|
36
|
+
if(express.ori < ani.ph$Nnode*4){
|
37
|
+
options(expressions=min(c(5e7,ani.ph$Nnode*4)))
|
38
|
+
}
|
39
|
+
write.tree(ani.ph, paste(out_base, ".nwk", sep=""))
|
40
|
+
options(expressions=express.ori)
|
37
41
|
|
38
42
|
# Silhouette
|
39
43
|
say("Silhouette")
|
@@ -45,11 +49,13 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
|
|
45
49
|
c(s$avg.width, -sum(ifelse(s$widths[,3]>0,0,s$widths[,3])))
|
46
50
|
})
|
47
51
|
stopCluster(cl)
|
48
|
-
|
52
|
+
s.avg.z <- (s[1,]-mean(s[1,]))/(sd(s[1,])+0.0001)
|
53
|
+
s.neg.z <- (s[2,]-mean(s[2,]))/(sd(s[2,])+0.01)
|
54
|
+
ds <- s.avg.z - s.neg.z - 2/(1:length(k)) - (1:length(k))/50
|
49
55
|
top.n <- k[which.max(ds)]
|
50
56
|
|
51
57
|
# Classify genomes
|
52
|
-
say("Classify")
|
58
|
+
say("Classify => k :", top.n, "| n :", length(labels(ani.d)))
|
53
59
|
ani.cl <- pam(ani.d, top.n, pamonce=1)
|
54
60
|
ani.types <- ani.cl$clustering
|
55
61
|
ani.medoids <- ani.cl$medoids
|
@@ -57,9 +63,9 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
|
|
57
63
|
# Generate graphic report
|
58
64
|
say("Graphic report")
|
59
65
|
pdf(paste(out_base, ".pdf", sep=""), 7, 12)
|
60
|
-
layout(1
|
66
|
+
layout(matrix(c(1,1,2,2,3,3,4,5),byrow=TRUE, ncol=2))
|
61
67
|
plot_distances(ani.d)
|
62
|
-
plot_silhouette(k, s[1,], s[2,], top.n)
|
68
|
+
plot_silhouette(k, s[1,], s[2,], ds, top.n)
|
63
69
|
plot_clustering(ani.cl, ani.d, ani.types)
|
64
70
|
plot_tree(ani.ph, ani.types, ani.medoids)
|
65
71
|
dev.off()
|
@@ -70,8 +76,9 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
|
|
70
76
|
quote=FALSE, col.names=FALSE, row.names=FALSE)
|
71
77
|
save(ani.d, file=paste(out_base, "dist.rdata", sep="."))
|
72
78
|
classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
|
79
|
+
ani.d.m <- 100 - as.matrix(ani.d)*100
|
73
80
|
for(j in 1:nrow(classif)){
|
74
|
-
classif[j,4] <-
|
81
|
+
classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
|
75
82
|
}
|
76
83
|
write.table(classif, paste(out_base,"classif",sep="."),
|
77
84
|
quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
|
@@ -106,19 +113,26 @@ generate_empty_files <- function(out_base) {
|
|
106
113
|
file.create(paste(out_base,".1.medoids",sep=""))
|
107
114
|
}
|
108
115
|
|
109
|
-
plot_silhouette <- function(k, s, ds, top.n) {
|
116
|
+
plot_silhouette <- function(k, s, ns, ds, top.n) {
|
117
|
+
# s
|
110
118
|
par(mar=c(4,5,1,5)+0.1)
|
111
119
|
plot(1, t="n", xlab="k (clusters)", ylab="", xlim=range(c(0,k)),
|
112
120
|
ylim=range(s), bty="n", xaxs="i", yaxt="n")
|
113
121
|
polygon(c(k[1], k, k[length(k)]), c(0,s,0), border=NA, col="grey80")
|
114
122
|
axis(2, fg="grey60", col.axis="grey60")
|
115
123
|
mtext("Mean silhouette", side=2, line=3, col="grey60")
|
124
|
+
# ns
|
116
125
|
par(new=TRUE)
|
117
126
|
plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
|
118
|
-
ylim=range(
|
119
|
-
points(k,
|
127
|
+
ylim=range(ns), bty="n", xaxs="i")
|
128
|
+
points(k, ns, type="o", pch=16, col=rgb(1/2,0,0,3/4))
|
120
129
|
axis(4, fg="darkred", col.axis="darkred")
|
121
130
|
mtext("Negative silhouette area", side=4, line=3, col="darkred")
|
131
|
+
# ds
|
132
|
+
par(new=TRUE)
|
133
|
+
plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
|
134
|
+
ylim=range(ds), bty="n", xaxs="i")
|
135
|
+
lines(k, ds)
|
122
136
|
abline(v=top.n, lty=2)
|
123
137
|
}
|
124
138
|
|
@@ -134,8 +148,18 @@ plot_clustering <- function(cl, dist, types) {
|
|
134
148
|
plot(silhouette(cl), col=col)
|
135
149
|
if(length(labels(dist))<=15){
|
136
150
|
plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
|
151
|
+
plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
|
137
152
|
}else{
|
138
|
-
|
153
|
+
ani.mds <- cmdscale(dist, k=4)
|
154
|
+
if(ncol(ani.mds)==4){
|
155
|
+
plot(ani.mds[,1], ani.mds[,2], col=col[types], cex=1/2,
|
156
|
+
xlab='Component 1', ylab='Component 2')
|
157
|
+
plot(ani.mds[,3], ani.mds[,4], col=col[types], cex=1/2,
|
158
|
+
xlab='Component 3', ylab='Component 4')
|
159
|
+
}else{
|
160
|
+
plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
|
161
|
+
plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
|
162
|
+
}
|
139
163
|
}
|
140
164
|
}
|
141
165
|
|
@@ -144,6 +168,8 @@ plot_tree <- function(phy, types, medoids){
|
|
144
168
|
top.n <- length(unique(types))
|
145
169
|
col <- ggplotColours(top.n)
|
146
170
|
is.medoid <- phy$tip.label %in% medoids
|
171
|
+
phy$tip.label[is.medoid] <- paste(phy$tip.label[is.medoid],
|
172
|
+
" [", types[phy$tip.label[is.medoid]], "]", sep='')
|
147
173
|
plot(phy, cex=ifelse(is.medoid, 1/3, 1/6),
|
148
174
|
font=ifelse(is.medoid, 2, 1),
|
149
175
|
tip.color=col[types[phy$tip.label]])
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2.1
|
4
|
+
version: 0.2.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
@@ -30,14 +30,14 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - ~>
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: 1.2.4
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - ~>
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: 1.2.4
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: json
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -140,6 +140,8 @@ files:
|
|
140
140
|
- utils/plot-taxdist.R
|
141
141
|
- utils/requirements.txt
|
142
142
|
- utils/subclades-compile.rb
|
143
|
+
- utils/subclades-nj.R
|
144
|
+
- utils/subclades-pam.R
|
143
145
|
- utils/subclades.R
|
144
146
|
- bin/miga
|
145
147
|
- actions/add_result.rb
|
@@ -154,6 +156,7 @@ files:
|
|
154
156
|
- actions/index_taxonomy.rb
|
155
157
|
- actions/list_datasets.rb
|
156
158
|
- actions/list_files.rb
|
159
|
+
- actions/plugins.rb
|
157
160
|
- actions/project_info.rb
|
158
161
|
- actions/result_stats.rb
|
159
162
|
- actions/tax_distributions.rb
|