miga-base 0.2.2.1 → 0.2.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +4 -1
- data/actions/create_dataset.rb +2 -5
- data/actions/daemon.rb +1 -0
- data/actions/plugins.rb +25 -0
- data/actions/result_stats.rb +10 -0
- data/bin/miga +1 -0
- data/lib/miga/daemon.rb +12 -4
- data/lib/miga/dataset.rb +4 -3
- data/lib/miga/project.rb +38 -4
- data/lib/miga/remote_dataset.rb +2 -2
- data/lib/miga/version.rb +1 -1
- data/scripts/_distances_functions.bash +20 -20
- data/scripts/_distances_noref_nomulti.bash +20 -13
- data/scripts/_distances_ref_nomulti.bash +11 -10
- data/scripts/aai_distances.bash +15 -12
- data/scripts/ani_distances.bash +14 -11
- data/scripts/assembly.bash +2 -1
- data/scripts/cds.bash +2 -2
- data/scripts/clade_finding.bash +2 -1
- data/scripts/distances.bash +2 -2
- data/scripts/essential_genes.bash +14 -4
- data/scripts/haai_distances.bash +17 -20
- data/scripts/init.bash +1 -1
- data/scripts/miga.bash +6 -0
- data/scripts/mytaxa.bash +2 -2
- data/scripts/mytaxa_scan.bash +2 -2
- data/scripts/ogs.bash +2 -2
- data/scripts/read_quality.bash +2 -2
- data/scripts/ssu.bash +2 -2
- data/scripts/stats.bash +3 -2
- data/scripts/subclades.bash +2 -2
- data/scripts/trimmed_fasta.bash +2 -2
- data/scripts/trimmed_reads.bash +2 -2
- data/test/daemon_test.rb +1 -1
- data/test/test_helper.rb +2 -2
- data/utils/subclades-nj.R +244 -0
- data/utils/subclades-pam.R +186 -0
- data/utils/subclades.R +39 -13
- metadata +6 -3
data/scripts/essential_genes.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="essential"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -8,6 +9,16 @@ cd "$PROJECT/data/07.annotation/01.function/01.essential"
|
|
8
9
|
|
9
10
|
# Initialize
|
10
11
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.start"
|
12
|
+
FAA="../../../06.cds/$DATASET.faa"
|
13
|
+
|
14
|
+
# Check if there are any proteins
|
15
|
+
if [[ ! -s $FAA ]] ; then
|
16
|
+
echo Empty protein set, bypassing essential genes
|
17
|
+
rm "$DATASET.start"
|
18
|
+
miga create_dataset -P "$PROJECT" -D $DATASET \
|
19
|
+
-m run_essential_genes=false --update
|
20
|
+
exit 0
|
21
|
+
fi
|
11
22
|
|
12
23
|
# Find and extract essential genes
|
13
24
|
[[ -d "$DATASET.ess" ]] && rm -R "$DATASET.ess"
|
@@ -15,16 +26,15 @@ mkdir "$DATASET.ess"
|
|
15
26
|
TYPE=$(miga list_datasets -P "$PROJECT" -D "$DATASET" \
|
16
27
|
--metadata "type" | awk '{print $2}')
|
17
28
|
if [[ "$TYPE" == "metagenome" || "$TYPE" == "virome" ]] ; then
|
18
|
-
HMM.essential.rb -i "
|
29
|
+
HMM.essential.rb -i "$FAA" -o "$DATASET.ess.faa" \
|
19
30
|
-m "$DATASET.ess/" -t "$CORES" -r "$DATASET" --metagenome \
|
20
31
|
> "$DATASET.ess/log"
|
21
32
|
else
|
22
|
-
HMM.essential.rb -i "
|
33
|
+
HMM.essential.rb -i "$FAA" -o "$DATASET.ess.faa" \
|
23
34
|
-m "$DATASET.ess/" -t "$CORES" -r "$DATASET" \
|
24
35
|
> "$DATASET.ess/log"
|
25
36
|
fi
|
26
37
|
|
27
38
|
# Finalize
|
28
39
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
|
29
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r
|
30
|
-
|
40
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT"
|
data/scripts/haai_distances.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="haai_distances"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -15,32 +16,28 @@ DS=$(miga list_datasets -P "$PROJECT" --ref --no-multi)
|
|
15
16
|
# Extract values
|
16
17
|
echo "metric a b value sd n omega" | tr " " "\\t" >miga-project.txt
|
17
18
|
for i in $DS ; do
|
18
|
-
|
19
|
-
|
20
|
-
|
19
|
+
echo "SELECT 'hAAI', seq1, seq2, aai, sd, n, omega from aai ;" \
|
20
|
+
| sqlite3 "$i.db" | tr "\\|" "\\t" >>miga-project.txt
|
21
|
+
echo "$i" >> miga-project.log
|
21
22
|
done
|
22
23
|
|
23
24
|
# R-ify
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
h
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
"
|
37
|
-
fi
|
38
|
-
fi | R --vanilla
|
25
|
+
echo "
|
26
|
+
haai <- read.table('miga-project.txt', sep='\\t', h=T, as.is=TRUE);
|
27
|
+
save(haai, file='miga-project.Rdata');
|
28
|
+
if(sum(haai[,'a'] != haai[,'b']) > 0){
|
29
|
+
h <- hist(haai[haai[,'a'] != haai[,'b'], 'value'], breaks=100, plot=FALSE);
|
30
|
+
write.table(
|
31
|
+
cbind(h[['breaks']][-length(h[['breaks']])],
|
32
|
+
h[['breaks']][-1],h[['counts']]),
|
33
|
+
file='miga-project.hist', quote=FALSE, sep='\\t',
|
34
|
+
col.names=FALSE, row.names=FALSE);
|
35
|
+
}
|
36
|
+
" | R --vanilla
|
39
37
|
|
40
38
|
# Gzip
|
41
39
|
gzip -9 -f miga-project.txt
|
42
40
|
|
43
41
|
# Finalize
|
44
42
|
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
|
45
|
-
miga add_result -P "$PROJECT" -r
|
46
|
-
|
43
|
+
miga add_result -P "$PROJECT" -r "$SCRIPT"
|
data/scripts/init.bash
CHANGED
@@ -106,7 +106,7 @@ echo "export PATH=\$MIGA_PATH\$PATH" >> "$HOME/.miga_rc"
|
|
106
106
|
# Check for R packages
|
107
107
|
echo "
|
108
108
|
Looking for R packages:" >&2
|
109
|
-
RLIBS="enveomics.R ape ggdendro ggplot2 gridExtra cluster dendextend vegan scatterplot3d"
|
109
|
+
RLIBS="enveomics.R ape phangorn phytools ggdendro ggplot2 gridExtra cluster dendextend vegan scatterplot3d"
|
110
110
|
for lib in $RLIBS ; do
|
111
111
|
if ! check_rlib $lib ; then
|
112
112
|
echo "+ Installing $lib" >&2
|
data/scripts/miga.bash
CHANGED
@@ -3,8 +3,14 @@ set -e
|
|
3
3
|
#MIGA=${MIGA:-$(cd "$(dirname "$0")/.."; pwd)}
|
4
4
|
source "$HOME/.miga_rc"
|
5
5
|
export PATH="$MIGA/bin:$PATH"
|
6
|
+
SCRIPT=${SCRIPT:-$(basename $0 .bash)}
|
6
7
|
|
7
8
|
function exists { [[ -e "$1" ]] ; }
|
9
|
+
function fx_exists { [[ $(type -t $1) == "function" ]] ; }
|
10
|
+
|
11
|
+
for i in $(miga plugins -P "$PROJECT") ; do
|
12
|
+
source "$i/scripts-plugin.bash"
|
13
|
+
done
|
8
14
|
|
9
15
|
#if [[ "$RUNTYPE" == "qsub" ]] ; then
|
10
16
|
#elif [[ "$RUNTYPE" == "msub" ]] ; then
|
data/scripts/mytaxa.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="mytaxa"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -90,5 +91,4 @@ fi
|
|
90
91
|
|
91
92
|
# Finalize
|
92
93
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
|
93
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r
|
94
|
-
|
94
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT"
|
data/scripts/mytaxa_scan.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="mytaxa_scan"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -82,5 +83,4 @@ fi
|
|
82
83
|
# Finalize
|
83
84
|
rm -R "$TMPDIR"
|
84
85
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
|
85
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r
|
86
|
-
|
86
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT"
|
data/scripts/ogs.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="ogs"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -36,5 +37,4 @@ rm -rf miga-project.rbm
|
|
36
37
|
|
37
38
|
# Finalize
|
38
39
|
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
|
39
|
-
miga add_result -P "$PROJECT" -r
|
40
|
-
|
40
|
+
miga add_result -P "$PROJECT" -r "$SCRIPT"
|
data/scripts/read_quality.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="read_quality"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -34,5 +35,4 @@ exists ../02.trimmed_reads/$b.[12].*.pdf \
|
|
34
35
|
|
35
36
|
# Finalize
|
36
37
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
|
37
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r
|
38
|
-
|
38
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT"
|
data/scripts/ssu.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="ssu"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -32,5 +33,4 @@ fi
|
|
32
33
|
|
33
34
|
# Finalize
|
34
35
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
|
35
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r
|
36
|
-
|
36
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT"
|
data/scripts/stats.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES, $DATASET
|
3
3
|
set -e
|
4
|
+
SCRIPT="stats"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -12,11 +13,11 @@ cd "$DIR"
|
|
12
13
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.start"
|
13
14
|
|
14
15
|
# Calculate statistics
|
15
|
-
for i in raw_reads trimmed_fasta assembly cds ; do
|
16
|
+
for i in raw_reads trimmed_fasta assembly cds essential_genes ; do
|
16
17
|
echo "# $i"
|
17
18
|
miga result_stats --compute-and-save -P "$PROJECT" -D "$DATASET" -r $i
|
18
19
|
done
|
19
20
|
|
20
21
|
# Finalize
|
21
22
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
|
22
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r
|
23
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT"
|
data/scripts/subclades.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="subclades"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -22,5 +23,4 @@ ruby "$MIGA/utils/subclades-compile.rb" . \
|
|
22
23
|
|
23
24
|
# Finalize
|
24
25
|
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
|
25
|
-
miga add_result -P "$PROJECT" -r
|
26
|
-
|
26
|
+
miga add_result -P "$PROJECT" -r "$SCRIPT"
|
data/scripts/trimmed_fasta.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="trimmed_fasta"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -44,5 +45,4 @@ done
|
|
44
45
|
|
45
46
|
# Finalize
|
46
47
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
|
47
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r
|
48
|
-
|
48
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT"
|
data/scripts/trimmed_reads.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="trimmed_reads"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -54,5 +55,4 @@ rm $b.[12].*.discard &>/dev/null
|
|
54
55
|
|
55
56
|
# Finalize
|
56
57
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
|
57
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r
|
58
|
-
|
58
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT"
|
data/test/daemon_test.rb
CHANGED
@@ -72,7 +72,7 @@ class DaemonTest < Test::Unit::TestCase
|
|
72
72
|
d.runopts(:latency, 0, true)
|
73
73
|
assert_equal(0, d.latency)
|
74
74
|
omit_if($jruby_tests, "JRuby doesn't implement fork.")
|
75
|
-
$child = fork { d.start }
|
75
|
+
$child = fork { d.start(["--shush"]) }
|
76
76
|
sleep(3)
|
77
77
|
dpath = File.expand_path("daemon/MiGA:#{p.name}",p.path)
|
78
78
|
assert(File.exist?("#{dpath}.pid"))
|
data/test/test_helper.rb
CHANGED
[diff body for this file (+2 -2) is missing from this capture]
data/utils/subclades-nj.R
ADDED
@@ -0,0 +1,244 @@
|
|
1
|
+
#!/usr/bin/env Rscript
|
2
|
+
#
|
3
|
+
# @package MiGA
|
4
|
+
# @license Artistic-2.0
|
5
|
+
#
|
6
|
+
|
7
|
+
#= Load stuff
|
8
|
+
argv <- commandArgs(trailingOnly=T)
|
9
|
+
suppressPackageStartupMessages(library(ape))
|
10
|
+
suppressPackageStartupMessages(library(vegan))
|
11
|
+
suppressPackageStartupMessages(library(cluster))
|
12
|
+
suppressPackageStartupMessages(library(phytools))
|
13
|
+
suppressPackageStartupMessages(library(phangorn))
|
14
|
+
suppressPackageStartupMessages(library(parallel))
|
15
|
+
suppressPackageStartupMessages(library(enveomics.R))
|
16
|
+
|
17
|
+
#= Main function
|
18
|
+
subclades <- function(ani_file, out_base, thr=1, ani=c()) {
|
19
|
+
say("==> Out base:", out_base, "<==")
|
20
|
+
|
21
|
+
# Input arguments
|
22
|
+
if(missing(ani_file)){
|
23
|
+
a <- as.data.frame(ani)
|
24
|
+
}else{
|
25
|
+
a <- read.table(gzfile(ani_file), sep="\t", header=TRUE, as.is=TRUE)
|
26
|
+
}
|
27
|
+
if(nrow(a)==0){
|
28
|
+
generate_empty_files(out_base)
|
29
|
+
return(NULL)
|
30
|
+
}
|
31
|
+
|
32
|
+
# Get ANI distances
|
33
|
+
say("Distances")
|
34
|
+
a$d <- 1-a$value/100
|
35
|
+
ani.d <- enve.df2dist(data.frame(a$a, a$b, a$d), default.d=max(a$d)*1.2)
|
36
|
+
ani.ph <- midpoint(bionj(ani.d))
|
37
|
+
express.ori <- options('expressions')$expressions
|
38
|
+
if(express.ori < ani.ph$Nnode*4){
|
39
|
+
options(expressions=min(c(5e7,ani.ph$Nnode*4)))
|
40
|
+
}
|
41
|
+
write.tree(ani.ph, paste(out_base, ".nwk", sep=""))
|
42
|
+
options(expressions=express.ori)
|
43
|
+
ani.ph$edge.length[ ani.ph$edge.length<0 ] <- 0
|
44
|
+
ani.cpd <- cophenetic(ani.ph)
|
45
|
+
# Transform phylogenetic tree for clustering
|
46
|
+
ani.hcl <- as.hclust(
|
47
|
+
# 3. Randomly split multifurcations
|
48
|
+
multi2di(
|
49
|
+
# 2. Coalescent
|
50
|
+
compute.brtime(
|
51
|
+
# 1. Collapse zero-length and negative branches
|
52
|
+
di2multi(ani.ph, tol=min(ani.ph$edge.length[ani.ph$edge.length>0])))))
|
53
|
+
|
54
|
+
# Silhouette
|
55
|
+
say("Silhouette")
|
56
|
+
k <- 2:min(length(labels(ani.d))-1, 100)
|
57
|
+
s <- sapply(k, function(x) {
|
58
|
+
library(cluster)
|
59
|
+
cl <- cutree(ani.hcl, k=x)
|
60
|
+
s <- silhouette(cl, dmatrix=ani.cpd)
|
61
|
+
c(mean(s[,'sil_width']),
|
62
|
+
-sum(ifelse(s[,'sil_width']>0,0,s[,'sil_width'])))
|
63
|
+
})
|
64
|
+
s.avg.z <- (s[1,]-mean(s[1,]))/(sd(s[1,])+0.0001)
|
65
|
+
s.neg.z <- (s[2,]-mean(s[2,]))/(sd(s[2,])+0.01)
|
66
|
+
ds <- s.avg.z - s.neg.z - 2/(1:length(k)) - (1:length(k))/50
|
67
|
+
top.n <- k[which.max(ds)]
|
68
|
+
|
69
|
+
# Classify genomes
|
70
|
+
say("Classify => k :", top.n, "| n :", length(labels(ani.d)))
|
71
|
+
ani.types <- cutree(ani.hcl, k=top.n)
|
72
|
+
ani.medoids <- sapply(unique(ani.types),
|
73
|
+
clust.medoid, as.matrix(ani.d), ani.types)
|
74
|
+
|
75
|
+
# Generate graphic report
|
76
|
+
say("Graphic report")
|
77
|
+
pdf(paste(out_base, ".pdf", sep=""), 7, 12)
|
78
|
+
layout(matrix(c(1,1,2,2,3,3,4,5),byrow=TRUE, ncol=2))
|
79
|
+
plot_distances(ani.d)
|
80
|
+
plot_silhouette(k, s[1,], s[2,], ds, top.n)
|
81
|
+
plot_clustering(ani.hcl, ani.d, ani.types, ani.medoids)
|
82
|
+
plot_tree(ani.ph, ani.types, ani.medoids)
|
83
|
+
dev.off()
|
84
|
+
|
85
|
+
# Save results
|
86
|
+
say("Text report")
|
87
|
+
write.table(ani.medoids, paste(out_base, "medoids", sep="."),
|
88
|
+
quote=FALSE, col.names=FALSE, row.names=FALSE)
|
89
|
+
save(ani.d, file=paste(out_base, "dist.rdata", sep="."))
|
90
|
+
classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
|
91
|
+
ani.d.m <- 100 - as.matrix(ani.d)*100
|
92
|
+
for(j in 1:nrow(classif)){
|
93
|
+
classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
|
94
|
+
}
|
95
|
+
write.table(classif, paste(out_base,"classif",sep="."),
|
96
|
+
quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
|
97
|
+
|
98
|
+
# Recursive search
|
99
|
+
say("Recursive search")
|
100
|
+
for(i in 1:top.n){
|
101
|
+
medoid <- ani.medoids[i]
|
102
|
+
ds_f <- names(ani.types)[ ani.types==i ]
|
103
|
+
say("Analyzing subclade", i, "with medoid:", medoid)
|
104
|
+
dir.create(paste(out_base, ".sc-", i, sep=""))
|
105
|
+
write.table(ds_f,
|
106
|
+
paste(out_base, ".sc-", i, "/miga-project.all",sep=""),
|
107
|
+
quote=FALSE, col.names=FALSE, row.names=FALSE)
|
108
|
+
if(length(ds_f) > 5){
|
109
|
+
a_f <- a[ (a$a %in% ds_f) & (a$b %in% ds_f), ]
|
110
|
+
subclades(out_base=paste(out_base, ".sc-", i, "/miga-project", sep=""),
|
111
|
+
thr=thr, ani=a_f)
|
112
|
+
}
|
113
|
+
}
|
114
|
+
}
|
115
|
+
|
116
|
+
#= Helper functions
|
117
|
+
say <- function(...) { cat("[", date(), "]", ..., "\n") }
|
118
|
+
|
119
|
+
generate_empty_files <- function(out_base) {
|
120
|
+
pdf(paste(out_base, ".pdf", sep=""), 7, 12)
|
121
|
+
plot(1, t="n", axes=F)
|
122
|
+
legend("center", "No data", bty="n")
|
123
|
+
dev.off()
|
124
|
+
file.create(paste(out_base,".1.classif",sep=""))
|
125
|
+
file.create(paste(out_base,".1.medoids",sep=""))
|
126
|
+
}
|
127
|
+
|
128
|
+
plot_silhouette <- function(k, s, ns, ds, top.n) {
|
129
|
+
# s
|
130
|
+
par(mar=c(4,5,1,5)+0.1)
|
131
|
+
plot(1, t="n", xlab="k (clusters)", ylab="", xlim=range(c(0,k)),
|
132
|
+
ylim=range(s), bty="n", xaxs="i", yaxt="n")
|
133
|
+
polygon(c(k[1], k, k[length(k)]), c(0,s,0), border=NA, col="grey80")
|
134
|
+
axis(2, fg="grey60", col.axis="grey60")
|
135
|
+
mtext("Mean silhouette", side=2, line=3, col="grey60")
|
136
|
+
# ns
|
137
|
+
par(new=TRUE)
|
138
|
+
plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
|
139
|
+
ylim=range(ns), bty="n", xaxs="i")
|
140
|
+
points(k, ns, type="o", pch=16, col=rgb(1/2,0,0,3/4))
|
141
|
+
axis(4, fg="darkred", col.axis="darkred")
|
142
|
+
mtext("Negative silhouette area", side=4, line=3, col="darkred")
|
143
|
+
# ds
|
144
|
+
par(new=TRUE)
|
145
|
+
plot(1, t="n", xlab="", xaxt="n", ylab="", yaxt="n", xlim=range(c(0,k)),
|
146
|
+
ylim=range(ds), bty="n", xaxs="i")
|
147
|
+
lines(k, ds)
|
148
|
+
abline(v=top.n, lty=2)
|
149
|
+
}
|
150
|
+
|
151
|
+
plot_distances <- function(dist) {
|
152
|
+
par(mar=c(5,4,1,2)+0.1)
|
153
|
+
hist(dist, border=NA, col="grey60", breaks=50, xlab="Distances", main="")
|
154
|
+
}
|
155
|
+
|
156
|
+
plot_clustering <- function(hcl, dist, types, medoids) {
|
157
|
+
par(mar=c(5,4,4,2)+0.1)
|
158
|
+
top.n <- length(medoids)
|
159
|
+
col <- ggplotColours(top.n)
|
160
|
+
plot(silhouette(types, dist=dist), col=col)
|
161
|
+
if(length(labels(dist))<=15){
|
162
|
+
plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
|
163
|
+
plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
|
164
|
+
}else{
|
165
|
+
ani.mds <- cmdscale(dist, k=4)
|
166
|
+
if(ncol(ani.mds)==4){
|
167
|
+
plot(ani.mds[,1], ani.mds[,2], col=col[types], cex=1/2,
|
168
|
+
xlab='Component 1', ylab='Component 2')
|
169
|
+
plot(ani.mds[,3], ani.mds[,4], col=col[types], cex=1/2,
|
170
|
+
xlab='Component 3', ylab='Component 4')
|
171
|
+
}else{
|
172
|
+
plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
|
173
|
+
plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
|
174
|
+
}
|
175
|
+
}
|
176
|
+
}
|
177
|
+
|
178
|
+
plot_tree <- function(phy, types, medoids){
|
179
|
+
layout(1)
|
180
|
+
top.n <- length(unique(types))
|
181
|
+
col <- ggplotColours(top.n)
|
182
|
+
is.medoid <- phy$tip.label %in% medoids
|
183
|
+
phy$tip.label[is.medoid] <- paste(phy$tip.label[is.medoid],
|
184
|
+
" [", types[phy$tip.label[is.medoid]], "]", sep='')
|
185
|
+
plot(phy, cex=ifelse(is.medoid, 1/3, 1/6),
|
186
|
+
font=ifelse(is.medoid, 2, 1),
|
187
|
+
tip.color=col[types[phy$tip.label]])
|
188
|
+
}
|
189
|
+
|
190
|
+
ggplotColours <- function(n=6, h=c(0, 360)+15, alpha=1){
|
191
|
+
if ((diff(h)%%360) < 1) h[2] <- h[2] - 360/n
|
192
|
+
hcl(h=seq(h[1], h[2], length=n), c=100, l=65, alpha=alpha)
|
193
|
+
}
|
194
|
+
|
195
|
+
# Modified from https://www.biostars.org/p/11987/
|
196
|
+
clust.medoid <- function(i, distmat, clusters) {
|
197
|
+
ind <- (clusters == i)
|
198
|
+
if (sum(ind) <= 1){
|
199
|
+
return (rownames(distmat)[ind])
|
200
|
+
} else {
|
201
|
+
return(names(which.min(rowSums( distmat[ind, ind] ))))
|
202
|
+
}
|
203
|
+
}
|
204
|
+
|
205
|
+
# Code from http://grokbase.com/t/r/r-sig-phylo/109268tgx8/midpoint-rooting
|
206
|
+
midpoint <- function(tree){
|
207
|
+
dm = cophenetic(tree)
|
208
|
+
tree = unroot(tree)
|
209
|
+
rn = max(tree$edge)+1
|
210
|
+
maxdm = max(dm)
|
211
|
+
ind = which(dm==maxdm,arr=TRUE)[1,]
|
212
|
+
tmproot = Ancestors(tree, ind[1], "parent")
|
213
|
+
tree = phangorn:::reroot(tree, tmproot)
|
214
|
+
edge = tree$edge
|
215
|
+
el = tree$edge.length
|
216
|
+
children = tree$edge[,2]
|
217
|
+
left = match(ind[1], children)
|
218
|
+
tmp = Ancestors(tree, ind[2], "all")
|
219
|
+
tmp= c(ind[2], tmp[-length(tmp)])
|
220
|
+
right = match(tmp, children)
|
221
|
+
if(el[left]>= (maxdm/2)){
|
222
|
+
edge = rbind(edge, c(rn, ind[1]))
|
223
|
+
edge[left,2] = rn
|
224
|
+
el[left] = el[left] - (maxdm/2)
|
225
|
+
el = c(el, maxdm/2)
|
226
|
+
}else{
|
227
|
+
sel = cumsum(el[right])
|
228
|
+
i = which(sel>(maxdm/2))[1]
|
229
|
+
edge = rbind(edge, c(rn, tmp[i]))
|
230
|
+
edge[right[i],2] = rn
|
231
|
+
eltmp = sel[i] - (maxdm/2)
|
232
|
+
el = c(el, el[right[i]] - eltmp)
|
233
|
+
el[right[i]] = eltmp
|
234
|
+
}
|
235
|
+
tree$edge.length = el
|
236
|
+
tree$edge=edge
|
237
|
+
tree$Nnode = tree$Nnode+1
|
238
|
+
phangorn:::reorderPruning(phangorn:::reroot(tree, rn))
|
239
|
+
}
|
240
|
+
|
241
|
+
#= Main
|
242
|
+
subclades(ani_file=argv[1], out_base=argv[2],
|
243
|
+
thr=ifelse(is.na(argv[3]), 1, as.numeric(argv[3])))
|
244
|
+
|