miga-base 0.2.1.4 → 0.2.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/actions/create_dataset.rb +18 -0
- data/lib/miga/dataset.rb +4 -1
- data/lib/miga/metadata.rb +2 -2
- data/lib/miga/remote_dataset.rb +2 -1
- data/lib/miga/tax_dist.rb +2 -2
- data/lib/miga/version.rb +1 -1
- data/scripts/_distances_functions.bash +13 -0
- data/scripts/_distances_noref_nomulti.bash +58 -76
- data/scripts/haai_distances.bash +15 -9
- data/scripts/ogs.bash +3 -0
- data/utils/subclades.R +16 -9
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5145851e5906de7cafb98f5c701c69caca6404cc
|
4
|
+
data.tar.gz: 745cb4dcc63e62c3d054cae99a37a22a6dbdb80e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 71308be2992a9f78776f618afb2d8e8f3f4055d77d2cada696772130b2570468f7031fc3cee04371ebe721c8fd3e598d302cd20dda85fe4d43ddb850d1bdc213
|
7
|
+
data.tar.gz: 0ff19642a0dcd5a679adb9562988a731d631c811551400c02884b8a47ba39072f23efc1f5a1a651f87cdfd01d2c962464e2821965dc0a06b81202df2ee2f9d5b
|
data/actions/create_dataset.rb
CHANGED
@@ -16,6 +16,10 @@ OptionParser.new do |opt|
|
|
16
16
|
"Owner of the dataset."){ |v| o[:user]=v }
|
17
17
|
opt.on("-c", "--comments STRING",
|
18
18
|
"Comments on the dataset."){ |v| o[:comments]=v }
|
19
|
+
opt.on("-m", "--metadata STRING",
|
20
|
+
"Metadata as key-value pairs separated by = and delimited by comma.",
|
21
|
+
"Values are saved as strings except for booleans (true / false) or nil."
|
22
|
+
){ |v| o[:metadata]=v }
|
19
23
|
opt.on("--update",
|
20
24
|
"Updates the dataset if it already exists."){ o[:update]=true }
|
21
25
|
opt_common(opt, o)
|
@@ -34,6 +38,20 @@ $stderr.puts "Loading dataset." unless o[:q]
|
|
34
38
|
d = o[:update] ? p.dataset(o[:dataset]) :
|
35
39
|
MiGA::Dataset.new(p, o[:dataset], o[:ref], {})
|
36
40
|
raise "Dataset does not exist." if d.nil?
|
41
|
+
unless o[:metadata].nil?
|
42
|
+
o[:metadata].split(",").each do |pair|
|
43
|
+
(k,v) = pair.split("=")
|
44
|
+
case v
|
45
|
+
when "true"
|
46
|
+
v = true
|
47
|
+
when "false"
|
48
|
+
v = false
|
49
|
+
when "nil"
|
50
|
+
v = nil
|
51
|
+
end
|
52
|
+
d.metadata[k] = v
|
53
|
+
end
|
54
|
+
end
|
37
55
|
[:type, :description, :user, :comments].each do |k|
|
38
56
|
d.metadata[k]=o[k] unless o[k].nil?
|
39
57
|
end
|
data/lib/miga/dataset.rb
CHANGED
@@ -191,7 +191,9 @@ class MiGA::Dataset < MiGA::MiGA
|
|
191
191
|
# execution order). This typically corresponds to the result used as the
|
192
192
|
# initial input. Passes +save+ to #add_result.
|
193
193
|
def first_preprocessing(save=false)
|
194
|
-
@@PREPROCESSING_TASKS.find
|
194
|
+
@@PREPROCESSING_TASKS.find do |t|
|
195
|
+
not ignore_task?(t) and not add_result(t, save).nil?
|
196
|
+
end
|
195
197
|
end
|
196
198
|
|
197
199
|
##
|
@@ -212,6 +214,7 @@ class MiGA::Dataset < MiGA::MiGA
|
|
212
214
|
##
|
213
215
|
# Should I ignore +task+ for this dataset?
|
214
216
|
def ignore_task?(task)
|
217
|
+
return !metadata["run_#{task}"] unless metadata["run_#{task}"].nil?
|
215
218
|
( (@@EXCLUDE_NOREF_TASKS.include?(task) and not is_ref?) or
|
216
219
|
(@@ONLY_MULTI_TASKS.include?(task) and not is_multi?) or
|
217
220
|
(@@ONLY_NONMULTI_TASKS.include?(task) and not is_nonmulti?))
|
data/lib/miga/metadata.rb
CHANGED
@@ -107,8 +107,8 @@ class MiGA::Metadata < MiGA::MiGA
|
|
107
107
|
v=v.miga_name if k==:name
|
108
108
|
# Symbolize the special field :type
|
109
109
|
v=v.to_sym if k==:type
|
110
|
-
#
|
111
|
-
@data[k]=v
|
110
|
+
# Delete if nil, register, and return
|
111
|
+
v.nil? ? @data.delete(k) : (@data[k]=v)
|
112
112
|
end
|
113
113
|
|
114
114
|
##
|
data/lib/miga/remote_dataset.rb
CHANGED
@@ -130,7 +130,8 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
130
130
|
else
|
131
131
|
download("#{base}.LargeContigs.fna")
|
132
132
|
end
|
133
|
-
File.symlink(
|
133
|
+
File.symlink(
|
134
|
+
File.basename("#{base}.LargeContigs.fna"), "#{base}.AllContigs.fna")
|
134
135
|
File.open("#{base}.done", "w") { |ofh| ofh.puts Time.now.to_s }
|
135
136
|
else
|
136
137
|
raise "Unexpected error: Unsupported result for database #{db}."
|
data/lib/miga/tax_dist.rb
CHANGED
@@ -53,9 +53,9 @@ module MiGA::TaxDist
|
|
53
53
|
out = {}
|
54
54
|
meaning.each do |phrase, thresholds|
|
55
55
|
lwr, upr = thresholds
|
56
|
-
min = pv.values.select{ |v| v
|
56
|
+
min = pv.values.select{ |v| v < upr }.max
|
57
57
|
return out if min.nil?
|
58
|
-
if min
|
58
|
+
if min >= lwr
|
59
59
|
v = pv.select{ |_,v| v==min }
|
60
60
|
out[phrase] = (test==:intax ? v.reverse_each : v).first
|
61
61
|
end
|
data/lib/miga/version.rb
CHANGED
@@ -10,7 +10,7 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.2, 1, 4]
|
13
|
+
VERSION = [0.2, 1, 5]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
data/scripts/_distances_functions.bash
CHANGED
@@ -62,6 +62,19 @@ function haai {
|
|
62
62
|
fi
|
63
63
|
}
|
64
64
|
|
65
|
+
function haai_or_aai {
|
66
|
+
local FH1=$1
|
67
|
+
local FH2=$2
|
68
|
+
local DBH=$3
|
69
|
+
local F1=$4
|
70
|
+
local F2=$5
|
71
|
+
local DB=$6
|
72
|
+
local TH=$7
|
73
|
+
AAI=$(haai $FH1 $FH2 $TH $DBH $DB)
|
74
|
+
[[ "${AAI%.*}" -le 0 ]] && AAI=$(aai $F1 $F2 $TH $DB)
|
75
|
+
echo $AAI
|
76
|
+
}
|
77
|
+
|
65
78
|
function val_from_db {
|
66
79
|
local N1=$1
|
67
80
|
local N2=$2
|
data/scripts/_distances_noref_nomulti.bash
CHANGED
@@ -24,91 +24,73 @@ function checkpoint_n {
|
|
24
24
|
fi
|
25
25
|
}
|
26
26
|
|
27
|
+
function noref_haai_or_aai {
|
28
|
+
local Q=$1
|
29
|
+
local S=$2
|
30
|
+
haai_or_aai $ESS/$Q.ess.faa $ESS/$S.ess.faa $TMPDIR/$Q.haai.db \
|
31
|
+
../06.cds/$Q.faa ../06.cds/$S.faa $TMPDIR/$Q.aai.db $CORES
|
32
|
+
}
|
33
|
+
|
34
|
+
function noref_ani {
|
35
|
+
local Q=$1
|
36
|
+
local S=$2
|
37
|
+
ani ../05.assembly/$Q.LargeContigs.fna ../05.assembly/$S.LargeContigs.fna \
|
38
|
+
$CORES $TMPDIR/$Q.ani.db
|
39
|
+
}
|
40
|
+
|
27
41
|
ESS="../07.annotation/01.function/01.essential"
|
28
42
|
if [[ $(miga project_info -P "$PROJECT" -m type) != "clade" ]] ; then
|
29
43
|
# Classify aai-clade (if project type is not clade)
|
30
44
|
CLADES="../10.clades/01.find"
|
31
|
-
|
32
|
-
[[ -e "$DATASET.aai-medoids.tsv" ]] && rm "$DATASET.aai-medoids.tsv"
|
33
|
-
while [[ -e "$CLADES/$CLASSIF/miga-project.medoids" ]] ; do
|
34
|
-
MAX_AAI=0
|
35
|
-
AAI_MED=""
|
36
|
-
AAI_CLS=""
|
37
|
-
i_n=0
|
38
|
-
for i in $(cat "$CLADES/$CLASSIF/miga-project.medoids") ; do
|
39
|
-
let i_n=$i_n+1
|
40
|
-
AAI=$(haai $ESS/$DATASET.ess.faa $ESS/$i.ess.faa $CORES \
|
41
|
-
$TMPDIR/$DATASET.haai.db $TMPDIR/$DATASET.aai.db)
|
42
|
-
[[ "${AAI%.*}" -le 0 ]] \
|
43
|
-
&& AAI=$(aai ../06.cds/$DATASET.faa ../06.cds/$i.faa $CORES \
|
44
|
-
$TMPDIR/$DATASET.aai.db)
|
45
|
-
checkpoint_n
|
46
|
-
if [[ $(perl -e "print 1 if '$AAI' >= '$MAX_AAI'") == "1" ]] ; then
|
47
|
-
MAX_AAI=$AAI
|
48
|
-
AAI_MED=$i
|
49
|
-
AAI_CLS=$i_n
|
50
|
-
echo "[$CLASSIF] New max: $AAI_MED ($AAI_CLS): $MAX_AAI"
|
51
|
-
fi
|
52
|
-
done
|
53
|
-
CLASSIF="$CLASSIF/miga-project.sc-$AAI_CLS"
|
54
|
-
echo "$AAI_CLS $AAI_MED $MAX_AAI $CLASSIF" \
|
55
|
-
>> "$DATASET.aai-medoids.tsv"
|
56
|
-
done
|
57
|
-
|
58
|
-
# Calculate all the AAIs/ANIs against the lowest subclade (if classified)
|
59
|
-
if [[ "$CLASSIF" != "." ]] ; then
|
60
|
-
PAR=$(dirname "$CLADES/$CLASSIF")/miga-project.classif
|
61
|
-
if [[ -s "$PAR" ]] ; then
|
62
|
-
for i in $(cat "$PAR" | awk "\$2==$AAI_CLS{print \$1}") ; do
|
63
|
-
AAI=$(aai ../06.cds/$DATASET.faa ../06.cds/$i.faa $CORES \
|
64
|
-
$TMPDIR/$DATASET.aai.db)
|
65
|
-
if [[ $(perl -e "print 1 if '$AAI' >= 90") == "1" ]] ; then
|
66
|
-
ani ../05.assembly/$DATASET.LargeContigs.fna \
|
67
|
-
../05.assembly/$i.LargeContigs.fna \
|
68
|
-
$TMPDIR/$DATASET.ani.db >/dev/null
|
69
|
-
fi
|
70
|
-
checkpoint_n
|
71
|
-
done
|
72
|
-
fi
|
73
|
-
fi
|
45
|
+
METRIC="aai"
|
74
46
|
else
|
75
47
|
# Classify ani-clade (if project type is clade)
|
76
48
|
CLADES="../10.clades/02.ani"
|
77
|
-
|
78
|
-
|
79
|
-
while [[ -e "$CLADES/$CLASSIF/miga-project.medoids" ]] ; do
|
80
|
-
MAX_ANI=0
|
81
|
-
ANI_MED=""
|
82
|
-
ANI_CLS=""
|
83
|
-
i_n=0
|
84
|
-
for i in $(cat "$CLADES/$CLASSIF/miga-project.medoids") ; do
|
85
|
-
let i_n=$i_n+1
|
86
|
-
ANI=$(ani ../05.assembly/$DATASET.LargeContigs.fna \
|
87
|
-
../05.assembly/$i.LargeContigs.fna $CORES $TMPDIR/$DATASET.ani.db)
|
88
|
-
checkpoint_n
|
89
|
-
if [[ $(perl -e "print 1 if '$ANI' >= '$MAX_ANI'") == "1" ]] ; then
|
90
|
-
MAX_ANI=$ANI
|
91
|
-
ANI_MED=$i
|
92
|
-
ANI_CLS=$i_n
|
93
|
-
echo "[$CLASSIF] New max: $ANI_MED ($ANI_CLS): $MAX_ANI"
|
94
|
-
fi
|
95
|
-
done
|
96
|
-
CLASSIF="$CLASSIF/miga-project.sc-$ANI_CLS"
|
97
|
-
echo "$ANI_CLS $ANI_MED $MAX_ANI $CLASSIF" \
|
98
|
-
>> "$DATASET.ani-medoids.tsv"
|
99
|
-
done
|
49
|
+
METRIC="ani"
|
50
|
+
fi
|
100
51
|
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
52
|
+
CLASSIF="."
|
53
|
+
[[ -e "$DATASET.$METRIC-medoids.tsv" ]] && rm "$DATASET.$METRIC-medoids.tsv"
|
54
|
+
while [[ -e "$CLADES/$CLASSIF/miga-project.medoids" ]] ; do
|
55
|
+
MAX_VAL=0
|
56
|
+
VAL_MED=""
|
57
|
+
VAL_CLS=""
|
58
|
+
i_n=0
|
59
|
+
for i in $(cat "$CLADES/$CLASSIF/miga-project.medoids") ; do
|
60
|
+
let i_n=$i_n+1
|
61
|
+
if [[ $METRIC == "aai" ]] ; then
|
62
|
+
VAL=$(noref_haai_or_aai $DATASET $i)
|
63
|
+
else
|
64
|
+
VAL=$(noref_ani $DATASET $i)
|
65
|
+
fi
|
66
|
+
checkpoint_n
|
67
|
+
if [[ $(perl -e "print 1 if '$VAL' >= '$MAX_VAL'") == "1" ]] ; then
|
68
|
+
MAX_VAL=$VAL
|
69
|
+
VAL_MED=$i
|
70
|
+
VAL_CLS=$i_n
|
71
|
+
echo "[$CLASSIF] New max: $VAL_MED ($VAL_CLS): $MAX_VAL"
|
111
72
|
fi
|
73
|
+
done
|
74
|
+
CLASSIF="$CLASSIF/miga-project.sc-$VAL_CLS"
|
75
|
+
echo "$VAL_CLS $VAL_MED $MAX_VAL $CLASSIF" \
|
76
|
+
>> "$DATASET.$METRIC-medoids.tsv"
|
77
|
+
done
|
78
|
+
|
79
|
+
# Calculate all the AAIs/ANIs against the lowest subclade (if classified)
|
80
|
+
if [[ "$CLASSIF" != "." ]] ; then
|
81
|
+
PAR=$(dirname "$CLADES/$CLASSIF")/miga-project.classif
|
82
|
+
if [[ -s "$PAR" ]] ; then
|
83
|
+
for i in $(cat "$PAR" | awk "\$2==$VAL_CLS{print \$1}") ; do
|
84
|
+
if [[ $METRIC == "aai" ]] ; then
|
85
|
+
AAI=$(noref_haai_or_aai $DATASET $i)
|
86
|
+
else
|
87
|
+
AAI=100
|
88
|
+
fi
|
89
|
+
if [[ $(perl -e "print 1 if '$AAI' >= 90") == "1" ]] ; then
|
90
|
+
noref_ani $DATASET $i
|
91
|
+
fi
|
92
|
+
checkpoint_n
|
93
|
+
done
|
112
94
|
fi
|
113
95
|
fi
|
114
96
|
|
data/scripts/haai_distances.bash
CHANGED
@@ -21,15 +21,21 @@ for i in $DS ; do
|
|
21
21
|
done
|
22
22
|
|
23
23
|
# R-ify
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
24
|
+
if true ; then
|
25
|
+
echo "
|
26
|
+
haai <- read.table('miga-project.txt', sep='\\t', h=T);
|
27
|
+
save(haai, file='miga-project.Rdata');"
|
28
|
+
if [[ $(cat miga-project.txt | wc -l) -gt 1 ]] ; then
|
29
|
+
echo "
|
30
|
+
h <- hist(haai[,'value'], breaks=100, plot=FALSE);
|
31
|
+
write.table(
|
32
|
+
cbind(h[['breaks']][-length(h[['breaks']])],
|
33
|
+
h[['breaks']][-1],h[['counts']]),
|
34
|
+
file='miga-project.hist', quote=FALSE, sep='\\t',
|
35
|
+
col.names=FALSE, row.names=FALSE);
|
36
|
+
"
|
37
|
+
fi
|
38
|
+
fi | R --vanilla
|
33
39
|
|
34
40
|
# Gzip
|
35
41
|
gzip -9 -f miga-project.txt
|
data/scripts/ogs.bash
CHANGED
@@ -31,6 +31,9 @@ done
|
|
31
31
|
ogs.mcl.rb -o miga-project.ogs -d miga-project.rbm -t $CORES
|
32
32
|
ogs.stats.rb -o miga-project.ogs -j miga-project.stats
|
33
33
|
|
34
|
+
# Clean RBMs
|
35
|
+
rm -rf miga-project.rbm
|
36
|
+
|
34
37
|
# Finalize
|
35
38
|
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
|
36
39
|
miga add_result -P "$PROJECT" -r ogs
|
data/utils/subclades.R
CHANGED
@@ -41,15 +41,16 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
|
|
41
41
|
cl <- makeCluster(thr)
|
42
42
|
s <- parSapply(cl, k, function(x) {
|
43
43
|
library(cluster)
|
44
|
-
pam(ani.d, x, do.swap=FALSE, pamonce=1)$silinfo
|
44
|
+
s <- pam(ani.d, x, do.swap=FALSE, pamonce=1)$silinfo
|
45
|
+
c(s$avg.width, -sum(ifelse(s$widths[,3]>0,0,s$widths[,3])))
|
45
46
|
})
|
46
47
|
stopCluster(cl)
|
47
|
-
ds <-
|
48
|
-
top.n <-
|
48
|
+
ds <- s[1,]/s[2,]
|
49
|
+
top.n <- k[which.max(ds)]
|
49
50
|
|
50
51
|
# Classify genomes
|
51
52
|
say("Classify")
|
52
|
-
ani.cl <- pam(ani.d, top.n)
|
53
|
+
ani.cl <- pam(ani.d, top.n, pamonce=1)
|
53
54
|
ani.types <- ani.cl$clustering
|
54
55
|
ani.medoids <- ani.cl$medoids
|
55
56
|
|
@@ -58,7 +59,7 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
|
|
58
59
|
pdf(paste(out_base, ".pdf", sep=""), 7, 12)
|
59
60
|
layout(1:4)
|
60
61
|
plot_distances(ani.d)
|
61
|
-
plot_silhouette(k, s,
|
62
|
+
plot_silhouette(k, s[1,], s[2,], top.n)
|
62
63
|
plot_clustering(ani.cl, ani.d, ani.types)
|
63
64
|
plot_tree(ani.ph, ani.types, ani.medoids)
|
64
65
|
dev.off()
|
@@ -67,6 +68,7 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
|
|
67
68
|
say("Text report")
|
68
69
|
write.table(ani.medoids, paste(out_base, "medoids", sep="."),
|
69
70
|
quote=FALSE, col.names=FALSE, row.names=FALSE)
|
71
|
+
save(ani.d, file=paste(out_base, "dist.rdata", sep="."))
|
70
72
|
classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
|
71
73
|
for(j in 1:nrow(classif)){
|
72
74
|
classif[j,4] <- 100 - as.matrix(ani.d)[classif[j,1], classif[j,3]]
|
@@ -75,6 +77,7 @@ subclades <- function(ani_file, out_base, thr=1, ani=c()) {
|
|
75
77
|
quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
|
76
78
|
|
77
79
|
# Recursive search
|
80
|
+
say("Recursive search")
|
78
81
|
for(i in 1:top.n){
|
79
82
|
medoid <- ani.medoids[i]
|
80
83
|
ds_f <- names(ani.types)[ ani.types==i ]
|
@@ -109,13 +112,13 @@ plot_silhouette <- function(k, s, ds, top.n) {
|
|
109
112
|
ylim=range(s), bty="n", xaxs="i", yaxt="n")
|
110
113
|
polygon(c(k[1], k, k[length(k)]), c(0,s,0), border=NA, col="grey80")
|
111
114
|
axis(2, fg="grey60", col.axis="grey60")
|
112
|
-
mtext("
|
115
|
+
mtext("Mean silhouette", side=2, line=3, col="grey60")
|
113
116
|
par(new=TRUE)
|
114
117
|
plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
|
115
118
|
ylim=range(ds), bty="n", xaxs="i")
|
116
|
-
points(k
|
119
|
+
points(k, ds, type="o", pch=16, col=rgb(1/2,0,0,3/4))
|
117
120
|
axis(4, fg="darkred", col.axis="darkred")
|
118
|
-
mtext("
|
121
|
+
mtext("Negative silhouette area", side=4, line=3, col="darkred")
|
119
122
|
abline(v=top.n, lty=2)
|
120
123
|
}
|
121
124
|
|
@@ -129,7 +132,11 @@ plot_clustering <- function(cl, dist, types) {
|
|
129
132
|
top.n <- length(cl$medoids)
|
130
133
|
col <- ggplotColours(top.n)
|
131
134
|
plot(silhouette(cl), col=col)
|
132
|
-
|
135
|
+
if(length(labels(dist))<=15){
|
136
|
+
plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
|
137
|
+
}else{
|
138
|
+
clusplot(cl, dist=dist, main="", col.p=col[types], lines=0)
|
139
|
+
}
|
133
140
|
}
|
134
141
|
|
135
142
|
plot_tree <- function(phy, types, medoids){
|