miga-base 0.2.2.1 → 0.2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +4 -1
- data/actions/create_dataset.rb +2 -5
- data/actions/daemon.rb +1 -0
- data/actions/plugins.rb +25 -0
- data/actions/result_stats.rb +10 -0
- data/bin/miga +1 -0
- data/lib/miga/daemon.rb +12 -4
- data/lib/miga/dataset.rb +4 -3
- data/lib/miga/project.rb +38 -4
- data/lib/miga/remote_dataset.rb +2 -2
- data/lib/miga/version.rb +1 -1
- data/scripts/_distances_functions.bash +20 -20
- data/scripts/_distances_noref_nomulti.bash +20 -13
- data/scripts/_distances_ref_nomulti.bash +11 -10
- data/scripts/aai_distances.bash +15 -12
- data/scripts/ani_distances.bash +14 -11
- data/scripts/assembly.bash +2 -1
- data/scripts/cds.bash +2 -2
- data/scripts/clade_finding.bash +2 -1
- data/scripts/distances.bash +2 -2
- data/scripts/essential_genes.bash +14 -4
- data/scripts/haai_distances.bash +17 -20
- data/scripts/init.bash +1 -1
- data/scripts/miga.bash +6 -0
- data/scripts/mytaxa.bash +2 -2
- data/scripts/mytaxa_scan.bash +2 -2
- data/scripts/ogs.bash +2 -2
- data/scripts/read_quality.bash +2 -2
- data/scripts/ssu.bash +2 -2
- data/scripts/stats.bash +3 -2
- data/scripts/subclades.bash +2 -2
- data/scripts/trimmed_fasta.bash +2 -2
- data/scripts/trimmed_reads.bash +2 -2
- data/test/daemon_test.rb +1 -1
- data/test/test_helper.rb +2 -2
- data/utils/subclades-nj.R +244 -0
- data/utils/subclades-pam.R +186 -0
- data/utils/subclades.R +39 -13
- metadata +6 -3
data/scripts/essential_genes.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="essential"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -8,6 +9,16 @@ cd "$PROJECT/data/07.annotation/01.function/01.essential"
|
|
8
9
|
|
9
10
|
# Initialize
|
10
11
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.start"
|
12
|
+
FAA="../../../06.cds/$DATASET.faa"
|
13
|
+
|
14
|
+
# Check if there are any proteins
|
15
|
+
if [[ ! -s $FAA ]] ; then
|
16
|
+
echo Empty protein set, bypassing essential genes
|
17
|
+
rm "$DATASET.start"
|
18
|
+
# Flag the dataset so the essential-genes step is not attempted again.
# "$DATASET" is quoted like every other use in these scripts so the name is
# always passed as a single argument.
miga create_dataset -P "$PROJECT" -D "$DATASET" \
  -m run_essential_genes=false --update
|
20
|
+
exit 0
|
21
|
+
fi
|
11
22
|
|
12
23
|
# Find and extract essential genes
|
13
24
|
[[ -d "$DATASET.ess" ]] && rm -R "$DATASET.ess"
|
@@ -15,16 +26,15 @@ mkdir "$DATASET.ess"
|
|
15
26
|
TYPE=$(miga list_datasets -P "$PROJECT" -D "$DATASET" \
|
16
27
|
--metadata "type" | awk '{print $2}')
|
17
28
|
if [[ "$TYPE" == "metagenome" || "$TYPE" == "virome" ]] ; then
|
18
|
-
HMM.essential.rb -i "
|
29
|
+
HMM.essential.rb -i "$FAA" -o "$DATASET.ess.faa" \
|
19
30
|
-m "$DATASET.ess/" -t "$CORES" -r "$DATASET" --metagenome \
|
20
31
|
> "$DATASET.ess/log"
|
21
32
|
else
|
22
|
-
HMM.essential.rb -i "
|
33
|
+
HMM.essential.rb -i "$FAA" -o "$DATASET.ess.faa" \
|
23
34
|
-m "$DATASET.ess/" -t "$CORES" -r "$DATASET" \
|
24
35
|
> "$DATASET.ess/log"
|
25
36
|
fi
|
26
37
|
|
27
38
|
# Finalize
|
28
39
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
|
29
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r
|
30
|
-
|
40
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT"
|
data/scripts/haai_distances.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="haai_distances"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -15,32 +16,28 @@ DS=$(miga list_datasets -P "$PROJECT" --ref --no-multi)
|
|
15
16
|
# Extract values
|
16
17
|
echo "metric a b value sd n omega" | tr " " "\\t" >miga-project.txt
|
17
18
|
for i in $DS ; do
|
18
|
-
|
19
|
-
|
20
|
-
|
19
|
+
echo "SELECT 'hAAI', seq1, seq2, aai, sd, n, omega from aai ;" \
|
20
|
+
| sqlite3 "$i.db" | tr "\\|" "\\t" >>miga-project.txt
|
21
|
+
echo "$i" >> miga-project.log
|
21
22
|
done
|
22
23
|
|
23
24
|
# R-ify
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
h
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
"
|
37
|
-
fi
|
38
|
-
fi | R --vanilla
|
25
|
+
echo "
|
26
|
+
haai <- read.table('miga-project.txt', sep='\\t', h=T, as.is=TRUE);
|
27
|
+
save(haai, file='miga-project.Rdata');
|
28
|
+
if(sum(haai[,'a'] != haai[,'b']) > 0){
|
29
|
+
h <- hist(haai[haai[,'a'] != haai[,'b'], 'value'], breaks=100, plot=FALSE);
|
30
|
+
write.table(
|
31
|
+
cbind(h[['breaks']][-length(h[['breaks']])],
|
32
|
+
h[['breaks']][-1],h[['counts']]),
|
33
|
+
file='miga-project.hist', quote=FALSE, sep='\\t',
|
34
|
+
col.names=FALSE, row.names=FALSE);
|
35
|
+
}
|
36
|
+
" | R --vanilla
|
39
37
|
|
40
38
|
# Gzip
|
41
39
|
gzip -9 -f miga-project.txt
|
42
40
|
|
43
41
|
# Finalize
|
44
42
|
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
|
45
|
-
miga add_result -P "$PROJECT" -r
|
46
|
-
|
43
|
+
miga add_result -P "$PROJECT" -r "$SCRIPT"
|
data/scripts/init.bash
CHANGED
@@ -106,7 +106,7 @@ echo "export PATH=\$MIGA_PATH\$PATH" >> "$HOME/.miga_rc"
|
|
106
106
|
# Check for R packages
|
107
107
|
echo "
|
108
108
|
Looking for R packages:" >&2
|
109
|
-
RLIBS="enveomics.R ape ggdendro ggplot2 gridExtra cluster dendextend vegan scatterplot3d"
|
109
|
+
RLIBS="enveomics.R ape phangorn phytools ggdendro ggplot2 gridExtra cluster dendextend vegan scatterplot3d"
|
110
110
|
for lib in $RLIBS ; do
|
111
111
|
if ! check_rlib $lib ; then
|
112
112
|
echo "+ Installing $lib" >&2
|
data/scripts/miga.bash
CHANGED
@@ -3,8 +3,14 @@ set -e
|
|
3
3
|
#MIGA=${MIGA:-$(cd "$(dirname "$0")/.."; pwd)}
|
4
4
|
source "$HOME/.miga_rc"
|
5
5
|
export PATH="$MIGA/bin:$PATH"
|
6
|
+
# Default the script name to this file's basename without the .bash suffix,
# unless the caller already set SCRIPT. "$0" is quoted so a path containing
# spaces is not split into several arguments to basename.
SCRIPT=${SCRIPT:-$(basename "$0" .bash)}
|
6
7
|
|
7
8
|
# Succeeds when the first argument names an existing path; any further
# arguments (e.g. extra glob matches) are ignored.
exists() { test -e "$1" ; }
|
9
|
+
# Succeeds when the first argument is the name of a defined shell function.
# "$1" is quoted so the name reaches `type -t` as a single, unglobbed word.
function fx_exists { [[ $(type -t "$1") == "function" ]] ; }
|
10
|
+
|
11
|
+
for i in $(miga plugins -P "$PROJECT") ; do
|
12
|
+
source "$i/scripts-plugin.bash"
|
13
|
+
done
|
8
14
|
|
9
15
|
#if [[ "$RUNTYPE" == "qsub" ]] ; then
|
10
16
|
#elif [[ "$RUNTYPE" == "msub" ]] ; then
|
data/scripts/mytaxa.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="mytaxa"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -90,5 +91,4 @@ fi
|
|
90
91
|
|
91
92
|
# Finalize
|
92
93
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
|
93
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r
|
94
|
-
|
94
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT"
|
data/scripts/mytaxa_scan.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="mytaxa_scan"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -82,5 +83,4 @@ fi
|
|
82
83
|
# Finalize
|
83
84
|
rm -R "$TMPDIR"
|
84
85
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
|
85
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r
|
86
|
-
|
86
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT"
|
data/scripts/ogs.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="ogs"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -36,5 +37,4 @@ rm -rf miga-project.rbm
|
|
36
37
|
|
37
38
|
# Finalize
|
38
39
|
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
|
39
|
-
miga add_result -P "$PROJECT" -r
|
40
|
-
|
40
|
+
miga add_result -P "$PROJECT" -r "$SCRIPT"
|
data/scripts/read_quality.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="read_quality"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -34,5 +35,4 @@ exists ../02.trimmed_reads/$b.[12].*.pdf \
|
|
34
35
|
|
35
36
|
# Finalize
|
36
37
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
|
37
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r
|
38
|
-
|
38
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT"
|
data/scripts/ssu.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="ssu"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -32,5 +33,4 @@ fi
|
|
32
33
|
|
33
34
|
# Finalize
|
34
35
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
|
35
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r
|
36
|
-
|
36
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT"
|
data/scripts/stats.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES, $DATASET
|
3
3
|
set -e
|
4
|
+
SCRIPT="stats"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -12,11 +13,11 @@ cd "$DIR"
|
|
12
13
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.start"
|
13
14
|
|
14
15
|
# Calculate statistics
|
15
|
-
for i in raw_reads trimmed_fasta assembly cds ; do
|
16
|
+
for i in raw_reads trimmed_fasta assembly cds essential_genes ; do
|
16
17
|
echo "# $i"
|
17
18
|
miga result_stats --compute-and-save -P "$PROJECT" -D "$DATASET" -r $i
|
18
19
|
done
|
19
20
|
|
20
21
|
# Finalize
|
21
22
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
|
22
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r
|
23
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT"
|
data/scripts/subclades.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="subclades"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -22,5 +23,4 @@ ruby "$MIGA/utils/subclades-compile.rb" . \
|
|
22
23
|
|
23
24
|
# Finalize
|
24
25
|
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
|
25
|
-
miga add_result -P "$PROJECT" -r
|
26
|
-
|
26
|
+
miga add_result -P "$PROJECT" -r "$SCRIPT"
|
data/scripts/trimmed_fasta.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="trimmed_fasta"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -44,5 +45,4 @@ done
|
|
44
45
|
|
45
46
|
# Finalize
|
46
47
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
|
47
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r
|
48
|
-
|
48
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT"
|
data/scripts/trimmed_reads.bash
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
|
3
3
|
set -e
|
4
|
+
SCRIPT="trimmed_reads"
|
4
5
|
echo "MiGA: $MIGA"
|
5
6
|
echo "Project: $PROJECT"
|
6
7
|
source "$MIGA/scripts/miga.bash" || exit 1
|
@@ -54,5 +55,4 @@ rm $b.[12].*.discard &>/dev/null
|
|
54
55
|
|
55
56
|
# Finalize
|
56
57
|
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
|
57
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r
|
58
|
-
|
58
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT"
|
data/test/daemon_test.rb
CHANGED
@@ -72,7 +72,7 @@ class DaemonTest < Test::Unit::TestCase
|
|
72
72
|
d.runopts(:latency, 0, true)
|
73
73
|
assert_equal(0, d.latency)
|
74
74
|
omit_if($jruby_tests, "JRuby doesn't implement fork.")
|
75
|
-
$child = fork { d.start }
|
75
|
+
$child = fork { d.start(["--shush"]) }
|
76
76
|
sleep(3)
|
77
77
|
dpath = File.expand_path("daemon/MiGA:#{p.name}",p.path)
|
78
78
|
assert(File.exist?("#{dpath}.pid"))
|
data/test/test_helper.rb
CHANGED
(the diff body for this file is missing from this capture)
data/utils/subclades-nj.R
ADDED
@@ -0,0 +1,244 @@
|
|
1
|
+
#!/usr/bin/env Rscript
|
2
|
+
#
|
3
|
+
# @package MiGA
|
4
|
+
# @license Artistic-2.0
|
5
|
+
#
|
6
|
+
|
7
|
+
#= Load stuff
|
8
|
+
# Command-line arguments after the script name (TRUE spelled out: `T` can be
# reassigned)
argv <- commandArgs(trailingOnly=TRUE)
|
9
|
+
suppressPackageStartupMessages(library(ape))
|
10
|
+
suppressPackageStartupMessages(library(vegan))
|
11
|
+
suppressPackageStartupMessages(library(cluster))
|
12
|
+
suppressPackageStartupMessages(library(phytools))
|
13
|
+
suppressPackageStartupMessages(library(phangorn))
|
14
|
+
suppressPackageStartupMessages(library(parallel))
|
15
|
+
suppressPackageStartupMessages(library(enveomics.R))
|
16
|
+
|
17
|
+
#= Main function
|
18
|
+
# Builds a midpoint-rooted BIONJ tree from pairwise identities, picks a number
# of clusters from silhouette statistics, writes the reports, and recurses into
# every cluster with more than five members.
#
# ani_file: path to a tab-delimited table (read through gzfile) with a header
#           and at least the columns `a`, `b` and `value`.
# out_base: prefix for all output files (.nwk, .pdf, .medoids, .dist.rdata,
#           .classif) and for the per-cluster `.sc-<i>` directories.
# thr:      not used here other than being forwarded to recursive calls.
# ani:      table used instead of `ani_file` when `ani_file` is missing.
#
# Returns NULL (after writing placeholder files) when the table has no rows.
subclades <- function(ani_file, out_base, thr=1, ani=c()) {
  say("==> Out base:", out_base, "<==")

  # Input arguments
  if(missing(ani_file)){
    a <- as.data.frame(ani)
  }else{
    a <- read.table(gzfile(ani_file), sep="\t", header=TRUE, as.is=TRUE)
  }
  if(nrow(a)==0){
    generate_empty_files(out_base)
    return(NULL)
  }

  # Get ANI distances
  say("Distances")
  a$d <- 1-a$value/100
  ani.d <- enve.df2dist(data.frame(a$a, a$b, a$d), default.d=max(a$d)*1.2)
  ani.ph <- midpoint(bionj(ani.d))

  # Writing a large tree may need deeper expression nesting. R only accepts
  # values up to 500000 for this option, so the raised limit is capped there,
  # and the original value is restored even if write.tree fails.
  express.ori <- options('expressions')$expressions
  if(express.ori < ani.ph$Nnode*4){
    options(expressions=min(c(5e5, ani.ph$Nnode*4)))
  }
  tryCatch(
    write.tree(ani.ph, paste0(out_base, ".nwk")),
    finally=options(expressions=express.ori))

  ani.ph$edge.length[ ani.ph$edge.length<0 ] <- 0
  ani.cpd <- cophenetic(ani.ph)
  # Transform phylogenetic tree for clustering
  ani.hcl <- as.hclust(
    # 3. Randomly split multifurcations
    multi2di(
      # 2. Coalescent
      compute.brtime(
        # 1. Collapse zero-length and negative branches
        di2multi(ani.ph, tol=min(ani.ph$edge.length[ani.ph$edge.length>0])))))

  # Silhouette
  say("Silhouette")
  k <- 2:min(length(labels(ani.d))-1, 100)
  # Row 1: mean silhouette width; row 2: negative silhouette area
  s <- vapply(k, function(x) {
      cl <- cutree(ani.hcl, k=x)
      sil <- silhouette(cl, dmatrix=ani.cpd)
      width <- sil[, 'sil_width']
      c(mean(width), -sum(pmin(width, 0)))
    }, numeric(2))
  # Z-score with a small offset in the denominator. sd() of a single value is
  # NA (only one candidate k), which would leave no maximum to pick below, so
  # it is treated as zero spread.
  zscore <- function(x, offset) {
    x.sd <- sd(x)
    if(is.na(x.sd)) x.sd <- 0
    (x - mean(x))/(x.sd + offset)
  }
  s.avg.z <- zscore(s[1,], 0.0001)
  s.neg.z <- zscore(s[2,], 0.01)
  k.idx <- seq_along(k)
  ds <- s.avg.z - s.neg.z - 2/k.idx - k.idx/50
  top.n <- k[which.max(ds)]

  # Classify genomes
  say("Classify => k :", top.n, "| n :", length(labels(ani.d)))
  ani.types <- cutree(ani.hcl, k=top.n)
  ani.medoids <- sapply(unique(ani.types),
    clust.medoid, as.matrix(ani.d), ani.types)

  # Generate graphic report
  say("Graphic report")
  pdf(paste0(out_base, ".pdf"), 7, 12)
  layout(matrix(c(1,1,2,2,3,3,4,5), byrow=TRUE, ncol=2))
  plot_distances(ani.d)
  plot_silhouette(k, s[1,], s[2,], ds, top.n)
  plot_clustering(ani.hcl, ani.d, ani.types, ani.medoids)
  plot_tree(ani.ph, ani.types, ani.medoids)
  dev.off()

  # Save results
  say("Text report")
  write.table(ani.medoids, paste(out_base, "medoids", sep="."),
    quote=FALSE, col.names=FALSE, row.names=FALSE)
  save(ani.d, file=paste(out_base, "dist.rdata", sep="."))
  # Columns: genome, cluster, medoid of that cluster, identity to the medoid
  classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
  ani.d.m <- 100 - as.matrix(ani.d)*100
  # Look up every (genome, medoid) pair at once by dimnames
  classif[, 4] <- ani.d.m[cbind(classif[, 1], classif[, 3])]
  write.table(classif, paste(out_base, "classif", sep="."),
    quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")

  # Recursive search
  say("Recursive search")
  for(i in seq_len(top.n)){
    medoid <- ani.medoids[i]
    ds_f <- names(ani.types)[ ani.types==i ]
    say("Analyzing subclade", i, "with medoid:", medoid)
    sc.dir <- paste0(out_base, ".sc-", i)
    dir.create(sc.dir)
    write.table(ds_f, paste0(sc.dir, "/miga-project.all"),
      quote=FALSE, col.names=FALSE, row.names=FALSE)
    if(length(ds_f) > 5){
      a_f <- a[ (a$a %in% ds_f) & (a$b %in% ds_f), ]
      subclades(out_base=paste0(sc.dir, "/miga-project"), thr=thr, ani=a_f)
    }
  }
}
|
115
|
+
|
116
|
+
#= Helper functions
|
117
|
+
# Prints its arguments to standard output, prefixed with the current date in
# square brackets and followed by a newline.
say <- function(...) {
  cat("[", date(), "]", ..., "\n")
}
|
118
|
+
|
119
|
+
# Writes placeholder outputs for an empty input: a PDF showing "No data" plus
# empty `<out_base>.1.classif` and `<out_base>.1.medoids` files.
# NOTE(review): subclades() in this file writes `<out_base>.classif` and
# `<out_base>.medoids` (without `.1`); confirm which names downstream expects.
generate_empty_files <- function(out_base) {
  pdf(paste0(out_base, ".pdf"), 7, 12)
  # `type` and FALSE spelled out instead of the partially matched `t` and the
  # reassignable `F`
  plot(1, type="n", axes=FALSE)
  legend("center", "No data", bty="n")
  dev.off()
  file.create(paste0(out_base, ".1.classif"))
  file.create(paste0(out_base, ".1.medoids"))
}
|
127
|
+
|
128
|
+
# Overlays three series against the candidate numbers of clusters `k`:
# `s` as a filled grey area (left axis), `ns` as dark red points and lines
# (right axis), and `ds` as a plain line. A dashed vertical line marks `top.n`.
plot_silhouette <- function(k, s, ns, ds, top.n) {
  x.lim <- range(c(0, k))
  last.k <- k[length(k)]

  # Layer 1: s
  par(mar=c(4,5,1,5)+0.1)
  plot(1, type="n", xlim=x.lim, ylim=range(s), xlab="k (clusters)", ylab="",
    yaxt="n", xaxs="i", bty="n")
  polygon(x=c(k[1], k, last.k), y=c(0, s, 0), col="grey80", border=NA)
  axis(2, fg="grey60", col.axis="grey60")
  mtext("Mean silhouette", side=2, line=3, col="grey60")

  # Layer 2: ns, drawn on a fresh coordinate system over the same device region
  par(new=TRUE)
  plot(1, type="n", xlim=x.lim, ylim=range(ns), xlab="", ylab="",
    xaxt="n", yaxt="n", xaxs="i", bty="n")
  points(k, ns, type="o", pch=16, col=rgb(1/2,0,0,3/4))
  axis(4, fg="darkred", col.axis="darkred")
  mtext("Negative silhouette area", side=4, line=3, col="darkred")

  # Layer 3: ds, again with its own y-scale and no axes
  par(new=TRUE)
  plot(1, type="n", xlim=x.lim, ylim=range(ds), xlab="", ylab="",
    xaxt="n", yaxt="n", xaxs="i", bty="n")
  lines(k, ds)
  abline(v=top.n, lty=2)
}
|
150
|
+
|
151
|
+
# Draws a borderless grey histogram (50 requested breaks) of the values in
# `dist`, with no title.
plot_distances <- function(dist) {
  par(mar=c(5,4,1,2)+0.1)
  hist(
    dist,
    breaks=50,
    col="grey60",
    border=NA,
    main="",
    xlab="Distances")
}
|
155
|
+
|
156
|
+
# Draws three panels: the silhouette plot of `types` over `dist`, then two
# scatter plots of a four-dimensional classical MDS (components 1-2 and 3-4)
# coloured by cluster. The scatter panels are left blank when there are 15 or
# fewer items or when cmdscale does not return four columns.
# `hcl` is accepted but not used in this function.
plot_clustering <- function(hcl, dist, types, medoids) {
  blank_panel <- function() {
    plot(1, type="n", axes=FALSE, xlab="", ylab="", bty="n")
  }

  par(mar=c(5,4,4,2)+0.1)
  cluster.col <- ggplotColours(length(medoids))
  plot(silhouette(types, dist=dist), col=cluster.col)

  # Only attempt the ordination with more than 15 items
  coords <- NULL
  if(length(labels(dist)) > 15){
    coords <- cmdscale(dist, k=4)
    if(ncol(coords) != 4) coords <- NULL
  }

  if(is.null(coords)){
    # Keep the page layout by consuming both panels
    blank_panel()
    blank_panel()
  }else{
    plot(coords[,1], coords[,2], col=cluster.col[types], cex=1/2,
      xlab='Component 1', ylab='Component 2')
    plot(coords[,3], coords[,4], col=cluster.col[types], cex=1/2,
      xlab='Component 3', ylab='Component 4')
  }
}
|
177
|
+
|
178
|
+
# Plots the tree on a full page with tips coloured by cluster. Medoid tips are
# drawn larger and bold, with their cluster number appended in brackets.
#
# phy:     tree whose tip labels are names of `types`.
# types:   named vector of cluster numbers, one per tip.
# medoids: tip labels of the cluster medoids.
plot_tree <- function(phy, types, medoids){
  layout(1)
  top.n <- length(unique(types))
  col <- ggplotColours(top.n)
  is.medoid <- phy$tip.label %in% medoids
  # Resolve colours BEFORE relabelling: once " [k]" is appended, medoid labels
  # no longer match names(types), which would yield NA colours for exactly the
  # tips that are meant to stand out.
  tip.col <- col[types[phy$tip.label]]
  phy$tip.label[is.medoid] <- paste(phy$tip.label[is.medoid],
    " [", types[phy$tip.label[is.medoid]], "]", sep='')
  plot(phy, cex=ifelse(is.medoid, 1/3, 1/6),
    font=ifelse(is.medoid, 2, 1),
    tip.color=tip.col)
}
|
189
|
+
|
190
|
+
# Returns `n` colours with hues evenly spaced between h[1] and h[2], at fixed
# chroma 100 and luminance 65, with the given alpha.
ggplotColours <- function(n=6, h=c(0, 360)+15, alpha=1){
  # When the hue range spans the whole circle, stop one step short so the
  # first and last hues do not coincide.
  full.circle <- (diff(h) %% 360) < 1
  if(full.circle){
    h[2] <- h[2] - 360/n
  }
  hues <- seq(h[1], h[2], length.out=n)
  hcl(h=hues, c=100, l=65, alpha=alpha)
}
|
194
|
+
|
195
|
+
# Modified from https://www.biostars.org/p/11987/
|
196
|
+
# Returns the name of the medoid of cluster `i`: the member with the smallest
# summed distance to the other members of that cluster.
#
# i:        cluster identifier to look up in `clusters`.
# distmat:  square distance matrix with row and column names.
# clusters: cluster assignment per row of `distmat`.
clust.medoid <- function(i, distmat, clusters) {
  members <- clusters == i
  if(sum(members) > 1){
    within <- rowSums(distmat[members, members])
    return(names(which.min(within)))
  }
  # Zero or one member: nothing to compare against
  rownames(distmat)[members]
}
|
204
|
+
|
205
|
+
# Code from http://grokbase.com/t/r/r-sig-phylo/109268tgx8/midpoint-rooting
|
206
|
+
midpoint <- function(tree){
|
207
|
+
dm = cophenetic(tree)
|
208
|
+
tree = unroot(tree)
|
209
|
+
rn = max(tree$edge)+1
|
210
|
+
maxdm = max(dm)
|
211
|
+
ind = which(dm==maxdm,arr=TRUE)[1,]
|
212
|
+
tmproot = Ancestors(tree, ind[1], "parent")
|
213
|
+
tree = phangorn:::reroot(tree, tmproot)
|
214
|
+
edge = tree$edge
|
215
|
+
el = tree$edge.length
|
216
|
+
children = tree$edge[,2]
|
217
|
+
left = match(ind[1], children)
|
218
|
+
tmp = Ancestors(tree, ind[2], "all")
|
219
|
+
tmp= c(ind[2], tmp[-length(tmp)])
|
220
|
+
right = match(tmp, children)
|
221
|
+
if(el[left]>= (maxdm/2)){
|
222
|
+
edge = rbind(edge, c(rn, ind[1]))
|
223
|
+
edge[left,2] = rn
|
224
|
+
el[left] = el[left] - (maxdm/2)
|
225
|
+
el = c(el, maxdm/2)
|
226
|
+
}else{
|
227
|
+
sel = cumsum(el[right])
|
228
|
+
i = which(sel>(maxdm/2))[1]
|
229
|
+
edge = rbind(edge, c(rn, tmp[i]))
|
230
|
+
edge[right[i],2] = rn
|
231
|
+
eltmp = sel[i] - (maxdm/2)
|
232
|
+
el = c(el, el[right[i]] - eltmp)
|
233
|
+
el[right[i]] = eltmp
|
234
|
+
}
|
235
|
+
tree$edge.length = el
|
236
|
+
tree$edge=edge
|
237
|
+
tree$Nnode = tree$Nnode+1
|
238
|
+
phangorn:::reorderPruning(phangorn:::reroot(tree, rn))
|
239
|
+
}
|
240
|
+
|
241
|
+
#= Main
|
242
|
+
# Entry point: argv[1] is the identity table, argv[2] the output prefix, and
# argv[3] an optional numeric `thr` that falls back to 1 when absent.
subclades(
  ani_file=argv[1],
  out_base=argv[2],
  thr=if(is.na(argv[3])) 1 else as.numeric(argv[3]))
|
244
|
+
|