miga-base 0.2.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +351 -0
- data/actions/add_result +61 -0
- data/actions/add_taxonomy +86 -0
- data/actions/create_dataset +62 -0
- data/actions/create_project +70 -0
- data/actions/daemon +69 -0
- data/actions/download_dataset +77 -0
- data/actions/find_datasets +63 -0
- data/actions/import_datasets +86 -0
- data/actions/index_taxonomy +71 -0
- data/actions/list_datasets +83 -0
- data/actions/list_files +67 -0
- data/actions/unlink_dataset +52 -0
- data/bin/miga +48 -0
- data/lib/miga/daemon.rb +178 -0
- data/lib/miga/dataset.rb +286 -0
- data/lib/miga/gui.rb +289 -0
- data/lib/miga/metadata.rb +74 -0
- data/lib/miga/project.rb +268 -0
- data/lib/miga/remote_dataset.rb +154 -0
- data/lib/miga/result.rb +102 -0
- data/lib/miga/tax_index.rb +70 -0
- data/lib/miga/taxonomy.rb +107 -0
- data/lib/miga.rb +83 -0
- data/scripts/_distances_noref_nomulti.bash +86 -0
- data/scripts/_distances_ref_nomulti.bash +105 -0
- data/scripts/aai_distances.bash +40 -0
- data/scripts/ani_distances.bash +39 -0
- data/scripts/assembly.bash +38 -0
- data/scripts/cds.bash +45 -0
- data/scripts/clade_finding.bash +27 -0
- data/scripts/distances.bash +30 -0
- data/scripts/essential_genes.bash +29 -0
- data/scripts/haai_distances.bash +39 -0
- data/scripts/init.bash +211 -0
- data/scripts/miga.bash +12 -0
- data/scripts/mytaxa.bash +93 -0
- data/scripts/mytaxa_scan.bash +85 -0
- data/scripts/ogs.bash +36 -0
- data/scripts/read_quality.bash +37 -0
- data/scripts/ssu.bash +35 -0
- data/scripts/subclades.bash +26 -0
- data/scripts/trimmed_fasta.bash +47 -0
- data/scripts/trimmed_reads.bash +57 -0
- data/utils/adapters.fa +302 -0
- data/utils/mytaxa_scan.R +89 -0
- data/utils/mytaxa_scan.rb +58 -0
- data/utils/requirements.txt +19 -0
- data/utils/subclades-compile.rb +48 -0
- data/utils/subclades.R +171 -0
- metadata +185 -0
@@ -0,0 +1,105 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
# Available variables: $PROJECT, $DATASET, $RUNTYPE, $MIGA, $CORES, $TMPDIR,
|
3
|
+
# $NOMULTI, $REF
|
4
|
+
|
5
|
+
# Periodically saves the temporary distance databases back into the project.
#
# Globals:
#   N        (read/written) call counter; a checkpoint happens when it is 10,
#            after which it restarts at 0. It is always incremented on return.
#   TMPDIR   (read) directory holding the working copies 01.haai.db,
#            02.aai.db and 03.ani.db.
#   DATASET  (read) name used for the saved copies: <step>/$DATASET.db.
# Exits the shell with status 1 if a working database cannot be queried or
# copied, so that a broken file is never saved.
function checkpoint_n {
  local step table
  if [[ "$N" -eq 10 ]] ; then
    for step in 01.haai 02.aai 03.ani ; do
      # Only non-empty working databases are checkpointed
      [[ -s "$TMPDIR/$step.db" ]] || continue
      # 01.haai and 02.aai both use the 'aai' table; 03.ani uses 'ani'
      table="aai"
      [[ "$step" == "03.ani" ]] && table="ani"
      # Sanity query: refuse to save a database that sqlite3 cannot read
      echo "select count(*) from $table;" \
        | sqlite3 "$TMPDIR/$step.db" \
        || exit 1
      cp "$TMPDIR/$step.db" "$step/$DATASET.db" || exit 1
    done
    N=0
  fi
  N=$((N + 1))
}
|
21
|
+
|
22
|
+
# Directory (relative to 09.distances) with the essential-genes results
ESS="../07.annotation/01.function/01.essential"

# Initialize temporals: start from the databases already saved for $DATASET
for t in 01.haai 02.aai 03.ani ; do
  [[ -s "$t/$DATASET.db" ]] && cp "$t/$DATASET.db" "$TMPDIR/$t.db"
done
echo "create table if not exists aai(seq1 varchar(256), seq2 varchar(256)," \
  "aai float, sd float, n int, omega int);" | sqlite3 "$TMPDIR/02.aai.db"
N=1

# Traverse "nearly-half" of the ref-datasets using first-come-first-served
# NOTE(review): the loop below visits every dataset listed; the "half" appears
# to come from reusing values stored by the other dataset -- confirm.
for i in $(miga list_datasets -P "$PROJECT" --ref --no-multi) ; do
  echo "=[ $i ]"
  date "+%Y-%m-%d %H:%M:%S %z"
  HAAI=""; AAI=""; ANI="";
  # Check if the i-th dataset is ready
  [[ -s "$ESS/$i.done" && -s "$ESS/$i.json" ]] || continue
  # Check if this is done (e.g., in a previous failed iteration)
  AAI=$(echo "select aai from aai where seq1='$DATASET' and seq2='$i';" \
    | sqlite3 "$TMPDIR/02.aai.db" || echo "")
  # Try the other direction (query a private copy of the i-th database)
  if [[ "$AAI" == "" && -s "02.aai/$i.db" ]] ; then
    cp "02.aai/$i.db" "$TMPDIR/$i.db"
    AAI=$(echo "select aai from aai where seq2='$DATASET' and seq1='$i';" \
      | sqlite3 "$TMPDIR/$i.db" || echo "")
    rm "$TMPDIR/$i.db"
  fi
  # Try with hAAI
  if [[ "$AAI" == "" ]] ; then
    [[ -e "$TMPDIR/$DATASET.ess.faa" ]] \
      || cp "$ESS/$DATASET.ess.faa" "$TMPDIR/$DATASET.ess.faa"
    HAAI=$(aai.rb -1 "$TMPDIR/$DATASET.ess.faa" -2 "$ESS/$i.ess.faa" \
      -t "$CORES" -a -n 10 -S "$TMPDIR/01.haai.db" --name1 "$DATASET" \
      --name2 "$i" --lookup-first --no-save-rbm || echo "")
    # Only hAAI values below 90 are converted into an AAI estimate, which is
    # stored with sd, n and omega set to 0
    if [[ "$HAAI" != "" \
        && $(perl -MPOSIX -e "print floor $HAAI") -lt 90 ]] ; then
      AAI=$(perl -e \
        "printf '%f', 100-exp(2.435076 + 0.4275193*log(100-$HAAI))")
      echo "insert into aai values('$DATASET','$i','$AAI',0,0,0);" \
        | sqlite3 "$TMPDIR/02.aai.db"
    fi
  fi
  # Try with complete AAI
  if [[ "$AAI" == "" ]] ; then
    [[ -e "$TMPDIR/$DATASET.faa" ]] \
      || cp "../06.cds/$DATASET.faa" "$TMPDIR/$DATASET.faa"
    AAI=$(aai.rb -1 "$TMPDIR/$DATASET.faa" -2 "../06.cds/$i.faa" \
      -t "$CORES" -a -S "$TMPDIR/02.aai.db" --name1 "$DATASET" \
      --name2 "$i" --lookup-first || echo "")
  fi
  date "+%Y-%m-%d %H:%M:%S %z"
  # Check if ANI is meaningful: both assemblies exist and ceil(AAI) > 90
  if [[ -e "../05.assembly/$DATASET.LargeContigs.fna" \
      && -e "../05.assembly/$i.LargeContigs.fna" \
      && $(perl -MPOSIX -e "print ceil $AAI") -gt 90 ]] ; then
    # Check if this is done (e.g., in a previous failed iteration)
    ANI=$(echo "select ani from ani where seq1='$DATASET' and seq2='$i';" \
      | sqlite3 "$TMPDIR/03.ani.db" || echo "")
    # Try the other direction
    if [[ "$ANI" == "" && -s "03.ani/$i.db" ]] ; then
      cp "03.ani/$i.db" "$TMPDIR/$i.db"
      ANI=$(echo "select ani from ani" \
        "where seq2='$DATASET' and seq1='$i';" \
        | sqlite3 "$TMPDIR/$i.db" || echo "")
      rm "$TMPDIR/$i.db"
    fi
    # Calculate it
    if [[ "$ANI" == "" ]] ; then
      [[ -e "$TMPDIR/$DATASET.LargeContigs.fna" ]] \
        || cp "../05.assembly/$DATASET.LargeContigs.fna" \
          "$TMPDIR/$DATASET.LargeContigs.fna"
      ANI=$(ani.rb -1 "$TMPDIR/$DATASET.LargeContigs.fna" \
        -2 "../05.assembly/$i.LargeContigs.fna" -t "$CORES" \
        -S "$TMPDIR/03.ani.db" -a --name1 "$DATASET" --name2 "$i" \
        --no-save-regions --no-save-rbm --lookup-first \
        || echo "")
    fi
  fi
  echo "$AAI;$ANI"
  checkpoint_n
done
# Force a final checkpoint regardless of the current counter
N=10
checkpoint_n
|
105
|
+
|
@@ -0,0 +1,40 @@
|
|
1
|
+
#!/bin/bash
# Project-wide AAI summary.
# Reads: $PROJECT, $MIGA (the header also lists $RUNTYPE and $CORES).
# Collects the 'aai' table of every reference, non-multi dataset into
# miga-project.txt, derives miga-project.Rdata and miga-project.hist with R,
# compresses the table and registers the result.
echo "MiGA: $MIGA"
echo "Project: $PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/09.distances/02.aai"

# Mark the start of the task and reset the progress log
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.start"
: > miga-project.log
datasets=$(miga list_datasets -P "$PROJECT" --ref --no-multi)

# Header plus one tab-separated row per stored pair; rows with omega of 0
# are labelled 'hAAI_AAI', all others 'AAI'
printf 'metric\ta\tb\tvalue\tsd\tn\tomega\n' > miga-project.txt
query="SELECT CASE WHEN omega!=0 THEN 'AAI' ELSE 'hAAI_AAI' END,"
query="$query  seq1, seq2, aai, sd, n, omega from aai;"
for ds in $datasets ; do
  echo "$query" | sqlite3 "$ds.db" | tr '|' '\t' >> miga-project.txt
  echo "$ds" >> miga-project.log
done

# Save the table as an R object and write the histogram
# (bin start, bin end, count) of the 'value' column
R --vanilla <<'RSCRIPT'

aai <- read.table('miga-project.txt', sep='\t', h=T);
save(aai, file='miga-project.Rdata');
h <- hist(aai[,'value'], breaks=100, plot=FALSE);
write.table(
cbind(h[['breaks']][-length(h[['breaks']])],h[['breaks']][-1],h[['counts']]),
file='miga-project.hist', quote=FALSE, sep='\t',
col.names=FALSE, row.names=FALSE);

RSCRIPT

# Compress the text table, replacing any previous archive
gzip -9 -f miga-project.txt

# Mark completion and register the result
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
miga add_result -P "$PROJECT" -r aai_distances
|
40
|
+
|
@@ -0,0 +1,39 @@
|
|
1
|
+
#!/bin/bash
# Project-wide ANI summary.
# Reads: $PROJECT, $MIGA (the header also lists $RUNTYPE and $CORES).
# Collects the 'ani' table of every reference, non-multi dataset into
# miga-project.txt, derives miga-project.Rdata and miga-project.hist with R,
# compresses the table and registers the result.
echo "MiGA: $MIGA"
echo "Project: $PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/09.distances/03.ani"

# Mark the start of the task and reset the progress log
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.start"
: > miga-project.log
datasets=$(miga list_datasets -P "$PROJECT" --ref --no-multi)

# Header plus one tab-separated row per stored pair
printf 'metric\ta\tb\tvalue\tsd\tn\tomega\n' > miga-project.txt
query="SELECT 'ANI', seq1, seq2, ani, sd, n, omega from ani ;"
for ds in $datasets ; do
  echo "$query" | sqlite3 "$ds.db" | tr '|' '\t' >> miga-project.txt
  echo "$ds" >> miga-project.log
done

# Save the table as an R object and write the histogram
# (bin start, bin end, count) of the 'value' column
R --vanilla <<'RSCRIPT'

ani <- read.table('miga-project.txt', sep='\t', h=T);
save(ani, file='miga-project.Rdata');
h <- hist(ani[,'value'], breaks=100, plot=FALSE);
write.table(
cbind(h[['breaks']][-length(h[['breaks']])],h[['breaks']][-1],h[['counts']]),
file='miga-project.hist', quote=FALSE, sep='\t',
col.names=FALSE, row.names=FALSE);

RSCRIPT

# Compress the text table, replacing any previous archive
gzip -9 -f miga-project.txt

# Mark completion and register the result
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
miga add_result -P "$PROJECT" -r ani_distances
|
39
|
+
|
@@ -0,0 +1,38 @@
|
|
1
|
+
#!/bin/bash
# Available variables: $PROJECT, $DATASET, $RUNTYPE, $MIGA, $CORES
#
# Assembles the trimmed reads of $DATASET with idba_ud and extracts the
# contigs of at least 1000 bp.
echo "MiGA: $MIGA"
echo "Project: $PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/05.assembly"

# Initialize
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.start"

# Assemble: prefer coupled reads over single reads, plain over gzipped
FA="../04.trimmed_fasta/$DATASET.CoupledReads.fa"
[[ -e "$FA" ]] || FA="$FA.gz"
[[ -e "$FA" ]] || FA="../04.trimmed_fasta/$DATASET.SingleReads.fa"
[[ -e "$FA" ]] || FA="$FA.gz"
idba_ud --pre_correction -r "$FA" -o "$DATASET" --num_threads "$CORES"

# Clean: drop intermediate files. -f keeps a missing intermediate (an
# unmatched glob) from aborting the task; a missing output directory still
# fails because of the cd.
(
  cd "$DATASET" \
    && rm -f kmer graph-*.fa align-* local-contig-*.fa contig-*.fa
)

# Extract: use scaffolds when available, contigs otherwise. -f replaces a
# link left behind by a previous run of this task.
if [[ -s "$DATASET/scaffold.fa" ]] ; then
  ln -sf "$DATASET/scaffold.fa" "$DATASET.AllContigs.fna"
else
  ln -sf "$DATASET/contig.fa" "$DATASET.AllContigs.fna"
fi
# Keep only the sequences with length >= 1000
FastA.length.pl "$DATASET.AllContigs.fna" | awk '$2>=1000{print $1}' \
  | FastA.filter.pl /dev/stdin "$DATASET.AllContigs.fna" \
  > "$DATASET.LargeContigs.fna"

# Finalize
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
miga add_result -P "$PROJECT" -D "$DATASET" -r assembly
|
38
|
+
|
data/scripts/cds.bash
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
#!/bin/bash
# Available variables: $PROJECT, $DATASET, $RUNTYPE, $MIGA, $CORES
#
# Predicts coding sequences of $DATASET with MetaGeneMark (gmhmmp) and
# extracts the protein and nucleotide sequences from the resulting GFF.
echo "MiGA: $MIGA"
echo "Project: $PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/06.cds"

# Initialize
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.start"
# Locate the MetaGeneMark installation; fail with a clear message instead of
# relying on dirname's error when gmhmmp is not in the PATH
GMHMMP=$(type -P gmhmmp) || {
  echo "Impossible to find gmhmmp (MetaGeneMark) in the PATH." >&2
  exit 1
}
GM=$(dirname -- "$GMHMMP")

# Register key: copy the first license key found into the working directory
if [[ ! -e .gm_key ]] ; then
  for key in "$GM/gm_key" "$GM/gm_key_64" "$GM/gm_key_32" "$GM/.gm_key" \
      "$HOME/.gm_key" ; do
    if [[ -e "$key" ]] ; then
      cp "$key" ".gm_key"
      break
    fi
  done
  if [[ ! -e .gm_key ]] ; then
    echo "Impossible to find MetaGeneMark key, please register your copy" \
      "and place the key in '$GM/gm_key'." >&2
    exit 1
  fi
fi

# Run MetaGeneMark
gmhmmp -a -d -m "$GM/MetaGeneMark_v1.mod" -f G -o "$DATASET.gff2" \
  "../05.assembly/$DATASET.LargeContigs.fna"

# Extract
perl "$GM/aa_from_gff.pl" < "$DATASET.gff2" > "$DATASET.faa"
perl "$GM/nt_from_gff.pl" < "$DATASET.gff2" > "$DATASET.fna"

# Gzip
gzip -9 -f "$DATASET.gff2"

# Finalize
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
miga add_result -P "$PROJECT" -D "$DATASET" -r cds
|
45
|
+
|
@@ -0,0 +1,27 @@
|
|
1
|
+
#!/bin/bash
# Clade finding for the whole project.
# Reads: $PROJECT, $MIGA, $CORES (the header also lists $RUNTYPE).
# Clusters genomes with ogs.mcl.rb from the project ANI table, first using
# all stored pairs and then only those with a value of 95 or more.
echo "MiGA: $MIGA"
echo "Project: $PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/10.clades/01.find"

# Mark the start of the task
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.start"

# Markov-cluster genomes by ANI.
# Columns 2-4 (a, b, value) of the ANI table, without the header row. The
# file is named "aai90" although it is filled with ANI values.
gunzip -c ../../09.distances/03.ani/miga-project.txt.gz \
  | awk -F'\t' 'BEGIN { OFS = "\t" } NR > 1 { print $2, $3, $4 }' \
  > genome-genome.aai90.rbm
ogs.mcl.rb -d . -o miga-project.ani-clades -t "$CORES" -i \
  -f '(\S+)-(\S+)\.aai90\.rbm'
# Same clustering restricted to pairs with a value >= 95
awk -F'\t' '$3 >= 95' genome-genome.aai90.rbm > genome-genome.ani95.rbm
ogs.mcl.rb -d . -o miga-project.ani95-clades -t "$CORES" -b \
  -f '(\S+)-(\S+)\.ani95\.rbm'

# Propose clade projects: ANI95 clusters with at least five members
tail -n +2 miga-project.ani95-clades | tr ',' '\t' | awk 'NF >= 5' \
  > miga-project.proposed-clades

# Mark completion and register the result
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
miga add_result -P "$PROJECT" -r clade_finding
|
27
|
+
|
@@ -0,0 +1,30 @@
|
|
1
|
+
#!/bin/bash
# Available variables: $PROJECT, $DATASET, $RUNTYPE, $MIGA, $CORES
#
# Computes the distances of $DATASET against the project by sourcing the
# submodule that matches the type of dataset. The submodules read $TMPDIR,
# $NOMULTI and $REF as set here.
echo "MiGA: $MIGA"
echo "Project: $PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/09.distances"

# Initialize
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.start"
TMPDIR=$(mktemp -d /tmp/MiGA.XXXXXXXXXXXX)
# Remove the temporary directory on every exit path (including an 'exit'
# inside a sourced submodule or an abort by 'set -e'), not only on signals
trap 'rm -rf -- "$TMPDIR"' EXIT
trap 'exit 1' SIGHUP SIGINT SIGTERM

# Check type of dataset: each count is 1 when the dataset is listed under
# the corresponding filter
NOMULTI=$(miga list_datasets -P "$PROJECT" -D "$DATASET" --no-multi \
  | wc -l | awk '{print $1}')
REF=$(miga list_datasets -P "$PROJECT" -D "$DATASET" --ref \
  | wc -l | awk '{print $1}')

# Call submodule (nothing is computed for datasets not listed by --no-multi)
if [[ "$NOMULTI" -eq "1" && "$REF" -eq "1" ]] ; then
  source "$MIGA/scripts/_distances_ref_nomulti.bash"
elif [[ "$NOMULTI" -eq "1" ]] ; then
  source "$MIGA/scripts/_distances_noref_nomulti.bash"
fi

# Finalize
rm -rf -- "$TMPDIR"
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
miga add_result -P "$PROJECT" -D "$DATASET" -r distances
|
30
|
+
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#!/bin/bash
# Essential genes of one dataset.
# Reads: $PROJECT, $DATASET, $MIGA, $CORES (the header also lists $RUNTYPE).
# Runs HMM.essential.rb on the predicted proteins of $DATASET, adding
# --metagenome when the dataset type is "metagenome" or "virome".
echo "MiGA: $MIGA"
echo "Project: $PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/07.annotation/01.function/01.essential"

# Mark the start of the task
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.start"

# Start from an empty per-dataset output directory
if [[ -d "$DATASET.ess" ]] ; then
  rm -R "$DATASET.ess"
fi
mkdir "$DATASET.ess"

# The dataset type is the second column reported for the "type" metadata
ds_type=$(miga list_datasets -P "$PROJECT" -D "$DATASET" \
  --metadata "type" | awk '{print $2}')

# Arguments shared by both kinds of dataset
ess_args=(
  -i "../../../06.cds/$DATASET.faa" -o "$DATASET.ess.faa"
  -m "$DATASET.ess/" -t "$CORES" -r "$DATASET"
)
case "$ds_type" in
  metagenome|virome)
    ess_args+=(--metagenome)
    ;;
esac
HMM.essential.rb "${ess_args[@]}" > "$DATASET.ess/log"

# Mark completion and register the result
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
miga add_result -P "$PROJECT" -D "$DATASET" -r essential
|
29
|
+
|
@@ -0,0 +1,39 @@
|
|
1
|
+
#!/bin/bash
# Project-wide hAAI summary.
# Reads: $PROJECT, $MIGA (the header also lists $RUNTYPE and $CORES).
# Collects the 'aai' table of every reference, non-multi dataset found in
# 01.haai into miga-project.txt, derives miga-project.Rdata and
# miga-project.hist with R, compresses the table and registers the result.
echo "MiGA: $MIGA"
echo "Project: $PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/09.distances/01.haai"

# Mark the start of the task and reset the progress log
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.start"
: > miga-project.log
datasets=$(miga list_datasets -P "$PROJECT" --ref --no-multi)

# Header plus one tab-separated row per stored pair
printf 'metric\ta\tb\tvalue\tsd\tn\tomega\n' > miga-project.txt
query="SELECT 'hAAI', seq1, seq2, aai, sd, n, omega from aai ;"
for ds in $datasets ; do
  echo "$query" | sqlite3 "$ds.db" | tr '|' '\t' >> miga-project.txt
  echo "$ds" >> miga-project.log
done

# Save the table as an R object and write the histogram
# (bin start, bin end, count) of the 'value' column
R --vanilla <<'RSCRIPT'

haai <- read.table('miga-project.txt', sep='\t', h=T);
save(haai, file='miga-project.Rdata');
h <- hist(haai[,'value'], breaks=100, plot=FALSE);
write.table(
cbind(h[['breaks']][-length(h[['breaks']])],h[['breaks']][-1],h[['counts']]),
file='miga-project.hist', quote=FALSE, sep='\t',
col.names=FALSE, row.names=FALSE);

RSCRIPT

# Compress the text table, replacing any previous archive
gzip -9 -f miga-project.txt

# Mark completion and register the result
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
miga add_result -P "$PROJECT" -r haai_distances
|
39
|
+
|
data/scripts/init.bash
ADDED
@@ -0,0 +1,211 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
set -e
|
3
|
+
|
4
|
+
#=======[ Functions ]
|
5
|
+
# Asks the user a question and returns the answer.
#
# Arguments:
#   $1  Question, printed verbatim to stderr.
#   $2  Default answer, shown in brackets and used when the reply is empty.
# Input:   one line from stdin.
# Output:  the answer (or the default) on stdout, without trailing newline.
#          The prompt goes to stderr so that callers can capture the answer
#          with $(ask_user ...).
function ask_user {
  local question=$1
  local default=$2
  local answer
  # Quoted, so glob characters in the question (e.g. '?') are never expanded
  printf '%s\n' "$question" >&2
  printf ' [%s] > ' "$default" >&2
  # -r keeps backslashes typed by the user; EOF simply yields the default
  read -r answer || true
  answer=${answer:-$default}
  # printf instead of echo: answers such as '-n' are printed literally
  printf '%s' "$answer"
}
|
14
|
+
|
15
|
+
# Asks where a required program lives and registers that location.
#
# Arguments:
#   $1  Name of the executable to look for.
#   $2  Human-readable name of the requirement.
#   $3  Text shown in parentheses in the question.
#   $4  Additional hint appended to the question.
# The directory currently providing $1 in the PATH is offered as default
# ("." when the program is not found). On success the chosen directory is
# appended to PATH and a MIGA_PATH line is appended to ~/.miga_rc; otherwise
# the script exits with status 1.
function check_req {
  local bin=$1
  local default answer
  # type -P is a bash builtin restricted to executables found in the PATH
  default=$(dirname "$(type -P "$bin")")
  answer=$(ask_user "Where can I find $2 ($3)? $4" "$default")
  if [[ -x "$answer/$bin" ]] ; then
    export PATH="$PATH:$answer"
    echo "MIGA_PATH=\"$answer:\$MIGA_PATH\" # $2" >> "$HOME/.miga_rc"
  else
    echo "Cannot find $2 at '$answer/$bin'. Aborting..." >&2
    exit 1
  fi
}
|
27
|
+
|
28
|
+
# Tests whether an R package can be loaded.
#
# Arguments:
#   $1  Name of the R package.
# Returns 0 when R prints the marker "GOT IT" after require() succeeds, and
# non-zero otherwise. The marker is passed to cat() as two words, so the R
# command itself never matches the search.
function check_rlib {
  local rlib=$1
  echo "if(require($rlib)) cat('GOT','IT')" | R --vanilla -q 2>&1 \
    | grep "GOT IT" > /dev/null
}
|
34
|
+
|
35
|
+
# Tests whether a Ruby gem can be required.
#
# Arguments:
#   $1  Name of the gem.
# Returns the exit status of a ruby process that only requires the gem.
# All output of ruby is discarded, so text printed while loading the gem
# cannot turn a success into a failure.
function check_gem {
  local gem=$1
  echo "require '$gem'" | ruby > /dev/null 2>&1
}
|
40
|
+
|
41
|
+
#=======[ Main ]
|
42
|
+
# Interactive setup: checks the requirements of MiGA and writes the
# configuration files ~/.miga_rc and ~/.miga_daemon.json.
MIGA_STARTUP="no"
# Root of the MiGA installation: parent of the directory holding this script
MIGA=$(cd "$(dirname "$0")/.."; pwd)
echo "
===[ Welcome to MiGA, the Microbial Genome Atlas ]===

I'm the initialization script, and I'll sniff around your computer to
make sure you have all the requirements for MiGA Daemons.
" >&2

if [[ "$(ask_user "Would you like to see all the requirements before starting? (yes / no)" "no")" == "yes" ]] ; then
  echo "" >&2
  cat "$MIGA/utils/requirements.txt" >&2
  echo "" >&2
fi

# Optionally reuse the values of a previous configuration as defaults
if [[ -e "$HOME/.miga_rc" ]] ; then
  case "$(ask_user "I found a previous configuration. Do you want to load the defaults within? (yes / no / cancel)" "yes")" in
    yes)
      source "$HOME/.miga_rc"
      # MIGA_CONFIG_DATE is only written at the very end of this script
      if [[ "$MIGA_CONFIG_DATE" == "" ]] ; then
        echo " Loaded incomplete configuration" >&2
      else
        echo " Loaded configuration from $MIGA_CONFIG_DATE" >&2
      fi
      ;;
    no)
      rm "$HOME/.miga_rc"
      ;;
    cancel)
      exit 0
      ;;
    *)
      echo "Cannot understand your answer, please use 'yes', 'no', or 'cancel'. Aborting..." >&2
      exit 1
      ;;
  esac
fi

# Start a fresh configuration file
echo "#!/bin/bash
# MiGA made this on $(date)
" > "$HOME/.miga_rc"

# Check Software requirements
MIGA_STARTUP=$(ask_user "Is there a script I need to load at startup? (no / path to the script to load)" "$MIGA_STARTUP")
if [[ "$MIGA_STARTUP" != "no" ]] ; then
  echo "MIGA_STARTUP='$MIGA_STARTUP'
source \"\$MIGA_STARTUP\"
" >> "$HOME/.miga_rc";
  source "$MIGA_STARTUP";
fi
echo "
Looking for Software requirements:" >&2
# One requirement per line after the two header lines; runs of tabs are
# collapsed so that the fields are: name, executable, website, hint
reqs=$(tail -n+3 "$MIGA/utils/requirements.txt" | perl -pe 's/\t+/\t/g')
IFS_BU=$IFS
IFS=$'\n'
for ln in $reqs ; do
  rname=$(echo "$ln" | awk -F'\t' '{print $1}')
  rtest=$(echo "$ln" | awk -F'\t' '{print $2}')
  rwebs=$(echo "$ln" | awk -F'\t' '{print $3}')
  rhint=$(echo "$ln" | awk -F'\t' '{print $4}')
  check_req "$rtest" "$rname" "$rwebs" "$rhint"
done
IFS=$IFS_BU
echo "export PATH=\$MIGA_PATH\$PATH" >> "$HOME/.miga_rc"

# Check for R packages
echo "
Looking for R packages:" >&2
if ! check_rlib enveomics.R ; then
  echo "+ Installing enveomics.R" >&2
  # Installed from the directory next to the one containing FastQ.tag.rb
  R CMD INSTALL "$(dirname "$(type -P "FastQ.tag.rb")")/../enveomics.R"
fi
RLIBS="ape ggdendro ggplot2 gridExtra cluster dendextend vegan scatterplot3d"
for lib in $RLIBS ; do
  if ! check_rlib "$lib" ; then
    echo "+ Installing $lib" >&2
    echo "install.packages('$lib', repos='http://cran.rstudio.com/')" \
      | R --vanilla -q
  fi
done

# Check for ruby gems
echo "
Looking for Ruby gems:" >&2
GEMS="rest_client sqlite3 daemons json"
for gem in $GEMS ; do
  if ! check_gem "$gem" ; then
    echo "+ Installing $gem (user-only)" >&2
    gem install --user "$gem"
  fi
done

# Check for other files
echo "
Looking for additional files:
+ MetaGeneMark license key" >&2
GM=$(type -P gmhmmp) || {
  echo "Cannot find gmhmmp in the PATH. Aborting..." >&2
  exit 1
}
GM=$(dirname -- "$GM")
if [[ ! -e "$GM/gm_key" && ! -e "$GM/gm_key_64" && ! -e "$GM/gm_key_32" && ! -e "$GM/.gm_key" && ! -e "$HOME/.gm_key" ]] ; then
  echo "Cannot find it, please place your license key in '$GM/gm_key'. Aborting..." >&2
  exit 1
fi
echo "+ MetaGeneMark scripts" >&2
if [[ ! -e "$GM/aa_from_gff.pl" || ! -e "$GM/nt_from_gff.pl" ]] ; then
  echo "Cannot find it, please place aa_from_gff.pl and nt_from_gff.pl in '$GM/'. Aborting..." >&2
  exit 1
fi
echo "+ MyTaxa scores database" >&2
MT=$(type -P MyTaxa) || {
  echo "Cannot find MyTaxa in the PATH. Aborting..." >&2
  exit 1
}
MT=$(dirname -- "$MT")
if [[ ! -d "$MT/db" ]] ; then
  echo "Cannot find it, please execute 'python $MT/utils/download_db.py'. Aborting..." >&2
  exit 1
fi
echo "+ MyTaxa DIAMOND database" >&2
if [[ ! -e "$MT/AllGenomes.faa.dmnd" ]] ; then
  echo "Cannot find it, please download 'http://enve-omics.ce.gatech.edu/data/public_mytaxa/AllGenomes.faa.dmnd' into '$MT'. Aborting..." >&2
  exit 1
fi

# Configure daemon
echo "
Default daemon configuration:" >&2
dtype=$(ask_user "Please select the type of daemon you want to setup (bash / qsub / msub)" "bash")
case "$dtype" in
  bash)
    dlatency=$(ask_user "For how long should I sleep? (# in seconds)" "30")
    dmaxjobs=$(ask_user "How many jobs can I launch at once?" "6")
    dppn=$(ask_user "How many CPUs can I use per job?" "2")
    echo "Setting up internal daemon defaults, if you don't understand this just leave default values:" >&2
    dcmd=$(ask_user "How should I launch tasks? Use %1\$s for script path, %2\$s for variables, %3\$d for CPUs, %4\$s for log file, and %5\$s for task name." "%2\$s '%1\$s' &> '%4\$s'")
    dvar=$(ask_user "How should I pass variables? Use %1\$s for keys and %2\$s for values." "%1\$s=%2\$s")
    dsep=$(ask_user "What should I use to separate variables?" " ")
    dalive=$(ask_user "How can I know that a process is still alive? Use %1\$s for PID, output should be 1 for running and 0 for non-running." "ps -p '%1\$s'|tail -n+2|wc -l|awk '{print \$1}'")
    ;;
  [qm]sub)
    dqueue=$(ask_user "What's the name of the queue I should use?" "")
    dlatency=$(ask_user "How long should I sleep? (# in seconds)" "150")
    dmaxjobs=$(ask_user "How many jobs can I launch at once?" "300")
    dppn=$(ask_user "How many CPUs can I use per job?" "4")
    echo "Setting up internal daemon defaults, if you don't understand this just leave default values:" >&2
    dcmd=$(ask_user "How should I launch tasks? Use %1\$s for script path, %2\$s for variables, and %3\$d for CPUs, %4\$s for log file, and %5\$s for task name." \
      "$dtype -q '$dqueue' -v '%2\$s' -l nodes=1:ppn=%3\$d %1\$s -j oe -o '%4\$s' -N '%5\$s' | grep .")
    dvar=$(ask_user "How should I pass variables? Use %1\$s for keys and %2\$s for values." "%1\$s=%2\$s")
    dsep=$(ask_user "What should I use to separate variables?" ",")
    if [[ "$dtype" == "qsub" ]] ; then
      dalive=$(ask_user "How can I know that a process is still alive? Use %1\$s for job id, output should be 1 for running and 0 for non-running." \
        "qstat -f '%1\$s'|grep ' job_state ='|perl -pe 's/.*= //'|grep '[^C]'|tail -n1|wc -l|awk '{print \$1}'")
    else
      dalive=$(ask_user "How can I know that a process is still alive? Use %1\$s for job id, output should be 1 for running and 0 for non-running." \
        "checkjob '%1\$s'|grep '^State:'|perl -pe 's/.*: //'|grep 'Deferred\\|Hold\\|Idle\\|Starting\\|Running\\|Blocked'|tail -n1|wc -l|awk '{print \$1}'")
    fi
    ;;
  *)
    # Without this, the numeric fields below would be empty and the JSON
    # file would be invalid
    echo "Cannot understand your answer, please use 'bash', 'qsub', or 'msub'. Aborting..." >&2
    exit 1
    ;;
esac
# NOTE(review): values are written verbatim, characters that are special in
# JSON (double quotes, backslashes) are not escaped.
echo "{
\"created\": \"$(date "+%Y-%m-%d %H:%M:%S %z")\",
\"updated\": \"$(date "+%Y-%m-%d %H:%M:%S %z")\",
\"type\" : \"$dtype\",
\"cmd\" : \"$dcmd\",
\"var\" : \"$dvar\",
\"varsep\" : \"$dsep\",
\"alive\" : \"$dalive\",
\"latency\": $dlatency,
\"maxjobs\": $dmaxjobs,
\"ppn\" : $dppn
}" > "$HOME/.miga_daemon.json"

# Confirm configuration: the date marks the configuration as complete
echo "
MIGA_CONFIG_DATE='$(date "+%Y-%m-%d %H:%M:%S %z")'
" >> "$HOME/.miga_rc"
|
211
|
+
|
data/scripts/miga.bash
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
#!/bin/bash
# Common preamble sourced by the task scripts.
# Stop at the first failing command; since this file is sourced, the option
# stays active in the script that loads it.
set -o errexit
# Disabled fallback for an unset $MIGA:
#MIGA=${MIGA:-$(cd "$(dirname "$0")/.."; pwd)}
# Load the user configuration
. "$HOME/.miga_rc"
# Give the executables shipped with MiGA precedence in the PATH
PATH="$MIGA/bin:$PATH"
export PATH
|
6
|
+
|
7
|
+
# Succeeds when the path given as $1 exists, whatever its type.
function exists {
  if [[ -e "$1" ]] ; then
    return 0
  fi
  return 1
}
|
8
|
+
|
9
|
+
#if [[ "$RUNTYPE" == "qsub" ]] ; then
|
10
|
+
#elif [[ "$RUNTYPE" == "msub" ]] ; then
|
11
|
+
#fi
|
12
|
+
|