miga-base 0.2.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +351 -0
- data/actions/add_result +61 -0
- data/actions/add_taxonomy +86 -0
- data/actions/create_dataset +62 -0
- data/actions/create_project +70 -0
- data/actions/daemon +69 -0
- data/actions/download_dataset +77 -0
- data/actions/find_datasets +63 -0
- data/actions/import_datasets +86 -0
- data/actions/index_taxonomy +71 -0
- data/actions/list_datasets +83 -0
- data/actions/list_files +67 -0
- data/actions/unlink_dataset +52 -0
- data/bin/miga +48 -0
- data/lib/miga/daemon.rb +178 -0
- data/lib/miga/dataset.rb +286 -0
- data/lib/miga/gui.rb +289 -0
- data/lib/miga/metadata.rb +74 -0
- data/lib/miga/project.rb +268 -0
- data/lib/miga/remote_dataset.rb +154 -0
- data/lib/miga/result.rb +102 -0
- data/lib/miga/tax_index.rb +70 -0
- data/lib/miga/taxonomy.rb +107 -0
- data/lib/miga.rb +83 -0
- data/scripts/_distances_noref_nomulti.bash +86 -0
- data/scripts/_distances_ref_nomulti.bash +105 -0
- data/scripts/aai_distances.bash +40 -0
- data/scripts/ani_distances.bash +39 -0
- data/scripts/assembly.bash +38 -0
- data/scripts/cds.bash +45 -0
- data/scripts/clade_finding.bash +27 -0
- data/scripts/distances.bash +30 -0
- data/scripts/essential_genes.bash +29 -0
- data/scripts/haai_distances.bash +39 -0
- data/scripts/init.bash +211 -0
- data/scripts/miga.bash +12 -0
- data/scripts/mytaxa.bash +93 -0
- data/scripts/mytaxa_scan.bash +85 -0
- data/scripts/ogs.bash +36 -0
- data/scripts/read_quality.bash +37 -0
- data/scripts/ssu.bash +35 -0
- data/scripts/subclades.bash +26 -0
- data/scripts/trimmed_fasta.bash +47 -0
- data/scripts/trimmed_reads.bash +57 -0
- data/utils/adapters.fa +302 -0
- data/utils/mytaxa_scan.R +89 -0
- data/utils/mytaxa_scan.rb +58 -0
- data/utils/requirements.txt +19 -0
- data/utils/subclades-compile.rb +48 -0
- data/utils/subclades.R +171 -0
- metadata +185 -0
@@ -0,0 +1,105 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
# Available variables: $PROJECT, $DATASET, $RUNTYPE, $MIGA, $CORES, $TMPDIR,
|
3
|
+
# $NOMULTI, $REF
|
4
|
+
|
5
|
+
#######################################
# Periodically save the temporary distance databases.
# Each call increments the counter N. When a call finds N equal to 10, every
# non-empty $TMPDIR/{01.haai,02.aai,03.ani}.db is probed with a count query
# and copied to <step>/$DATASET.db, and the counter restarts.
# Globals:   N (read/written), TMPDIR (read), DATASET (read)
# Arguments: none
# Outputs:   the row count reported by sqlite3 for each saved database
# Returns:   0; exits the shell with status 1 if a database cannot be queried
#######################################
function checkpoint_n {
  local t tab
  if [[ $N -eq 10 ]] ; then
    for t in 01.haai 02.aai 03.ani ; do
      if [[ -s "$TMPDIR/$t.db" ]] ; then
        # The ANI database stores its values in table 'ani', the others in
        # table 'aai'
        tab="aai"
        if [[ "$t" == "03.ani" ]] ; then
          tab="ani"
        fi
        # Only copy databases that can actually be queried
        echo "select count(*) from $tab;" \
          | sqlite3 "$TMPDIR/$t.db" \
          || exit 1
        cp "$TMPDIR/$t.db" "$t/$DATASET.db"
      fi
    done
    N=0
  fi
  N=$((N + 1))
}
|
21
|
+
|
22
|
+
# Directory with the essential-genes results, relative to the current
# directory.
ESS="../07.annotation/01.function/01.essential"

# Initialize temporals: start from any database saved for this dataset
for t in 01.haai 02.aai 03.ani ; do
  if [[ -s "$t/$DATASET.db" ]] ; then
    cp "$t/$DATASET.db" "$TMPDIR/$t.db"
  fi
done
echo "create table if not exists aai(seq1 varchar(256), seq2 varchar(256)," \
  "aai float, sd float, n int, omega int);" | sqlite3 "$TMPDIR/02.aai.db"
N=1

# Traverse "nearly-half" of the ref-datasets using first-come-first-served
for i in $(miga list_datasets -P "$PROJECT" --ref --no-multi) ; do
  echo "=[ $i ]"
  date "+%Y-%m-%d %H:%M:%S %z"
  HAAI=""; AAI=""; ANI=""
  # Check if the i-th dataset is ready
  [[ -s "$ESS/$i.done" && -s "$ESS/$i.json" ]] || continue
  # Check if this is done (e.g., in a previous failed iteration)
  AAI=$(echo "select aai from aai where seq1='$DATASET' and seq2='$i';" \
    | sqlite3 "$TMPDIR/02.aai.db" || echo "")
  # Try the other direction: query a temporary copy of the i-th database
  if [[ "$AAI" == "" && -s "02.aai/$i.db" ]] ; then
    cp "02.aai/$i.db" "$TMPDIR/$i.db"
    AAI=$(echo "select aai from aai where seq2='$DATASET' and seq1='$i';" \
      | sqlite3 "$TMPDIR/$i.db" || echo "")
    rm "$TMPDIR/$i.db"
  fi
  # Try with hAAI
  if [[ "$AAI" == "" ]] ; then
    [[ -e "$TMPDIR/$DATASET.ess.faa" ]] \
      || cp "$ESS/$DATASET.ess.faa" "$TMPDIR/$DATASET.ess.faa"
    HAAI=$(aai.rb -1 "$TMPDIR/$DATASET.ess.faa" -2 "$ESS/$i.ess.faa" \
      -t "$CORES" -a -n 10 -S "$TMPDIR/01.haai.db" --name1 "$DATASET" \
      --name2 "$i" --lookup-first --no-save-rbm || echo "")
    # An hAAI below 90 is converted into an AAI value with the formula below
    # and stored with sd, n and omega set to 0
    if [[ "$HAAI" != "" \
          && $(perl -MPOSIX -e "print floor $HAAI") -lt 90 ]] ; then
      AAI=$(perl -e \
        "printf '%f', 100-exp(2.435076 + 0.4275193*log(100-$HAAI))")
      echo "insert into aai values('$DATASET','$i','$AAI',0,0,0);" \
        | sqlite3 "$TMPDIR/02.aai.db"
    fi
  fi
  # Try with complete AAI
  if [[ "$AAI" == "" ]] ; then
    [[ -e "$TMPDIR/$DATASET.faa" ]] \
      || cp "../06.cds/$DATASET.faa" "$TMPDIR/$DATASET.faa"
    AAI=$(aai.rb -1 "$TMPDIR/$DATASET.faa" -2 "../06.cds/$i.faa" \
      -t "$CORES" -a -S "$TMPDIR/02.aai.db" --name1 "$DATASET" \
      --name2 "$i" --lookup-first \
      || echo "")
  fi
  date "+%Y-%m-%d %H:%M:%S %z"
  # Check if ANI is meaningful. The explicit non-empty test keeps perl from
  # being run on an empty AAI (which only produced an error message).
  if [[ "$AAI" != "" \
        && -e "../05.assembly/$DATASET.LargeContigs.fna" \
        && -e "../05.assembly/$i.LargeContigs.fna" \
        && $(perl -MPOSIX -e "print ceil $AAI") -gt 90 ]] ; then
    # Check if this is done (e.g., in a previous failed iteration)
    ANI=$(echo "select ani from ani where seq1='$DATASET' and seq2='$i';" \
      | sqlite3 "$TMPDIR/03.ani.db" || echo "")
    # Try the other direction
    if [[ "$ANI" == "" && -s "03.ani/$i.db" ]] ; then
      cp "03.ani/$i.db" "$TMPDIR/$i.db"
      ANI=$(echo "select ani from ani" \
        "where seq2='$DATASET' and seq1='$i';" \
        | sqlite3 "$TMPDIR/$i.db" || echo "")
      rm "$TMPDIR/$i.db"
    fi
    # Calculate it
    if [[ "$ANI" == "" ]] ; then
      [[ -e "$TMPDIR/$DATASET.LargeContigs.fna" ]] \
        || cp "../05.assembly/$DATASET.LargeContigs.fna" \
          "$TMPDIR/$DATASET.LargeContigs.fna"
      ANI=$(ani.rb -1 "$TMPDIR/$DATASET.LargeContigs.fna" \
        -2 "../05.assembly/$i.LargeContigs.fna" -t "$CORES" \
        -S "$TMPDIR/03.ani.db" -a --name1 "$DATASET" --name2 "$i" \
        --no-save-regions --no-save-rbm --lookup-first \
        || echo "")
    fi
  fi
  echo "$AAI;$ANI"
  checkpoint_n
done
# Force a final checkpoint regardless of the current counter
N=10
checkpoint_n
|
105
|
+
|
@@ -0,0 +1,40 @@
|
|
1
|
+
#!/bin/bash
# Collects the AAI values of all reference, non-multi datasets of a project
# into one table and summarizes them with R.
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
printf 'MiGA: %s\n' "$MIGA"
printf 'Project: %s\n' "$PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/09.distances/02.aai"

# Initialize
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.start"

: > miga-project.log
datasets=$(miga list_datasets -P "$PROJECT" --ref --no-multi)

# Extract values: one tab-delimited row per stored pair. Rows with omega=0
# are labelled 'hAAI_AAI', all others 'AAI'.
printf 'metric\ta\tb\tvalue\tsd\tn\tomega\n' > miga-project.txt
query="SELECT CASE WHEN omega!=0 THEN 'AAI' ELSE 'hAAI_AAI' END,"
query="$query  seq1, seq2, aai, sd, n, omega from aai;"
for ds in $datasets ; do
  sqlite3 "$ds.db" <<<"$query" | tr '|' '\t' >> miga-project.txt
  printf '%s\n' "$ds" >> miga-project.log
done

# R-ify: save the table as Rdata and write a 100-break histogram
R --vanilla <<'RSCRIPT'

aai <- read.table('miga-project.txt', sep='\t', h=T);
save(aai, file='miga-project.Rdata');
h <- hist(aai[,'value'], breaks=100, plot=FALSE);
write.table(
  cbind(h[['breaks']][-length(h[['breaks']])],h[['breaks']][-1],h[['counts']]),
  file='miga-project.hist', quote=FALSE, sep='\t',
  col.names=FALSE, row.names=FALSE);

RSCRIPT

# Gzip
gzip -9 -f miga-project.txt

# Finalize
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
miga add_result -P "$PROJECT" -r aai_distances
|
40
|
+
|
@@ -0,0 +1,39 @@
|
|
1
|
+
#!/bin/bash
# Collects the ANI values of all reference, non-multi datasets of a project
# into one table and summarizes them with R.
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
printf 'MiGA: %s\n' "$MIGA"
printf 'Project: %s\n' "$PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/09.distances/03.ani"

# Initialize
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.start"

: > miga-project.log
datasets=$(miga list_datasets -P "$PROJECT" --ref --no-multi)

# Extract values: one tab-delimited row per stored pair
printf 'metric\ta\tb\tvalue\tsd\tn\tomega\n' > miga-project.txt
query="SELECT 'ANI', seq1, seq2, ani, sd, n, omega from ani ;"
for ds in $datasets ; do
  sqlite3 "$ds.db" <<<"$query" | tr '|' '\t' >> miga-project.txt
  printf '%s\n' "$ds" >> miga-project.log
done

# R-ify: save the table as Rdata and write a 100-break histogram
R --vanilla <<'RSCRIPT'

ani <- read.table('miga-project.txt', sep='\t', h=T);
save(ani, file='miga-project.Rdata');
h <- hist(ani[,'value'], breaks=100, plot=FALSE);
write.table(
  cbind(h[['breaks']][-length(h[['breaks']])],h[['breaks']][-1],h[['counts']]),
  file='miga-project.hist', quote=FALSE, sep='\t',
  col.names=FALSE, row.names=FALSE);

RSCRIPT

# Gzip
gzip -9 -f miga-project.txt

# Finalize
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
miga add_result -P "$PROJECT" -r ani_distances
|
39
|
+
|
@@ -0,0 +1,38 @@
|
|
1
|
+
#!/bin/bash
# Assembles the trimmed reads of one dataset with idba_ud and extracts the
# contigs of at least 1000 bp.
# Available variables: $PROJECT, $DATASET, $RUNTYPE, $MIGA, $CORES
echo "MiGA: $MIGA"
echo "Project: $PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/05.assembly"

# Initialize
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.start"

# Assemble: use the first existing input, preferring coupled over single
# reads and plain over gzipped files
FA=""
for reads in \
    "../04.trimmed_fasta/$DATASET.CoupledReads.fa" \
    "../04.trimmed_fasta/$DATASET.CoupledReads.fa.gz" \
    "../04.trimmed_fasta/$DATASET.SingleReads.fa" \
    "../04.trimmed_fasta/$DATASET.SingleReads.fa.gz" ; do
  if [[ -e "$reads" ]] ; then
    FA="$reads"
    break
  fi
done
if [[ -z "$FA" ]] ; then
  echo "Cannot find trimmed reads for '$DATASET' in ../04.trimmed_fasta." >&2
  exit 1
fi
idba_ud --pre_correction -r "$FA" -o "$DATASET" --num_threads "$CORES"

# Clean: -f so that a pattern without matches does not abort the script
rm -f -- "$DATASET/kmer" "$DATASET"/graph-*.fa "$DATASET"/align-* \
  "$DATASET"/local-contig-*.fa "$DATASET"/contig-*.fa

# Extract: -f replaces a link left behind by a previous run
if [[ -s "$DATASET/scaffold.fa" ]] ; then
  ln -sf "$DATASET/scaffold.fa" "$DATASET.AllContigs.fna"
else
  ln -sf "$DATASET/contig.fa" "$DATASET.AllContigs.fna"
fi
FastA.length.pl "$DATASET.AllContigs.fna" | awk '$2>=1000{print $1}' \
  | FastA.filter.pl /dev/stdin "$DATASET.AllContigs.fna" \
  > "$DATASET.LargeContigs.fna"

# Finalize
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
miga add_result -P "$PROJECT" -D "$DATASET" -r assembly
|
38
|
+
|
data/scripts/cds.bash
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
#!/bin/bash
# Predicts coding sequences on the large contigs of one dataset with
# MetaGeneMark (gmhmmp) and extracts the protein and nucleotide sequences.
# Available variables: $PROJECT, $DATASET, $RUNTYPE, $MIGA, $CORES
echo "MiGA: $MIGA"
echo "Project: $PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/06.cds"

# Initialize
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.start"
gmhmmp_bin=$(type -P gmhmmp) || {
  echo "Cannot find gmhmmp in the PATH." >&2
  exit 1
}
# Directory where the model file, key and helper scripts are looked up
GM=$(dirname -- "$gmhmmp_bin")

# Register key: copy the first key found into ./.gm_key
if [[ ! -e .gm_key ]] ; then
  gm_key=""
  for candidate in "$GM/gm_key" "$GM/gm_key_64" "$GM/gm_key_32" \
      "$GM/.gm_key" "$HOME/.gm_key" ; do
    if [[ -e "$candidate" ]] ; then
      gm_key="$candidate"
      break
    fi
  done
  if [[ -z "$gm_key" ]] ; then
    echo "Impossible to find MetaGeneMark key, please register your copy" \
      "and place the key in '$GM/gm_key'." >&2
    exit 1
  fi
  cp "$gm_key" ".gm_key"
fi

# Run MetaGeneMark
gmhmmp -a -d -m "$GM/MetaGeneMark_v1.mod" -f G -o "$DATASET.gff2" \
  "../05.assembly/$DATASET.LargeContigs.fna"

# Extract
perl "$GM/aa_from_gff.pl" < "$DATASET.gff2" > "$DATASET.faa"
perl "$GM/nt_from_gff.pl" < "$DATASET.gff2" > "$DATASET.fna"

# Gzip
gzip -9 -f "$DATASET.gff2"

# Finalize
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
miga add_result -P "$PROJECT" -D "$DATASET" -r cds
|
45
|
+
|
@@ -0,0 +1,27 @@
|
|
1
|
+
#!/bin/bash
# Clusters the project's genomes with ogs.mcl.rb using the ANI table and
# proposes clades from the groups built with ANI >= 95.
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
printf 'MiGA: %s\n' "$MIGA"
printf 'Project: %s\n' "$PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/10.clades/01.find"

# Initialize
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.start"

# Markov-cluster genomes by ANI
# Columns 2-4 of the ANI table, skipping its header line
gunzip -c ../../09.distances/03.ani/miga-project.txt.gz \
  | awk -F'\t' -v OFS='\t' 'NR > 1 {print $2, $3, $4}' \
  > genome-genome.aai90.rbm
ogs.mcl.rb -d . -o miga-project.ani-clades -t "$CORES" -i \
  -f '(\S+)-(\S+)\.aai90\.rbm'
# Same pairs, restricted to a third column of at least 95
awk -F'\t' '$3>=95' < genome-genome.aai90.rbm > genome-genome.ani95.rbm
ogs.mcl.rb -d . -o miga-project.ani95-clades -t "$CORES" -b \
  -f '(\S+)-(\S+)\.ani95\.rbm'

# Propose clade projects: keep groups with at least five members
tail -n +2 miga-project.ani95-clades | tr ',' '\t' | awk 'NF >= 5' \
  > miga-project.proposed-clades

# Finalize
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
miga add_result -P "$PROJECT" -r clade_finding
|
27
|
+
|
@@ -0,0 +1,30 @@
|
|
1
|
+
#!/bin/bash
# Runs the distance calculations of one dataset by sourcing the sub-script
# that matches the dataset type.
# Available variables: $PROJECT, $DATASET, $RUNTYPE, $MIGA, $CORES
echo "MiGA: $MIGA"
echo "Project: $PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/09.distances"

# Initialize
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.start"
TMPDIR=$(mktemp -d /tmp/MiGA.XXXXXXXXXXXX)
# Remove the temporary directory on every exit path, including an abort
# inside a sourced sub-script. Single quotes delay (and quote) the expansion.
trap 'rm -rf -- "$TMPDIR"' EXIT
# Turn signals into a failing exit, which in turn fires the EXIT trap
trap 'exit 1' SIGHUP SIGINT SIGTERM

# Check type of dataset: each query prints one line when the dataset matches
NOMULTI=$(miga list_datasets -P "$PROJECT" -D "$DATASET" --no-multi \
  | wc -l | awk '{print $1}')
REF=$(miga list_datasets -P "$PROJECT" -D "$DATASET" --ref \
  | wc -l | awk '{print $1}')

# Call submodule
if [[ "$NOMULTI" -eq "1" && "$REF" -eq "1" ]] ; then
  source "$MIGA/scripts/_distances_ref_nomulti.bash"
elif [[ "$NOMULTI" -eq "1" ]] ; then
  source "$MIGA/scripts/_distances_noref_nomulti.bash"
fi

# Finalize
rm -R -- "$TMPDIR"
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
miga add_result -P "$PROJECT" -D "$DATASET" -r distances
|
30
|
+
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#!/bin/bash
# Detects and extracts the essential genes of one dataset with
# HMM.essential.rb.
# Available variables: $PROJECT, $DATASET, $RUNTYPE, $MIGA, $CORES
printf 'MiGA: %s\n' "$MIGA"
printf 'Project: %s\n' "$PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/07.annotation/01.function/01.essential"

# Initialize
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.start"

# Find and extract essential genes, starting from an empty output directory
if [[ -d "$DATASET.ess" ]] ; then
  rm -R "$DATASET.ess"
fi
mkdir "$DATASET.ess"
ds_type=$(miga list_datasets -P "$PROJECT" -D "$DATASET" \
  --metadata "type" | awk '{print $2}')
ess_args=(
  -i "../../../06.cds/$DATASET.faa" -o "$DATASET.ess.faa"
  -m "$DATASET.ess/" -t "$CORES" -r "$DATASET"
)
# Metagenomes and viromes get the additional --metagenome flag
case "$ds_type" in
  metagenome|virome)
    ess_args+=(--metagenome)
    ;;
esac
HMM.essential.rb "${ess_args[@]}" > "$DATASET.ess/log"

# Finalize
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
miga add_result -P "$PROJECT" -D "$DATASET" -r essential
|
29
|
+
|
@@ -0,0 +1,39 @@
|
|
1
|
+
#!/bin/bash
# Collects the hAAI values of all reference, non-multi datasets of a project
# into one table and summarizes them with R.
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
printf 'MiGA: %s\n' "$MIGA"
printf 'Project: %s\n' "$PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/09.distances/01.haai"

# Initialize
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.start"

: > miga-project.log
datasets=$(miga list_datasets -P "$PROJECT" --ref --no-multi)

# Extract values: one tab-delimited row per stored pair
printf 'metric\ta\tb\tvalue\tsd\tn\tomega\n' > miga-project.txt
query="SELECT 'hAAI', seq1, seq2, aai, sd, n, omega from aai ;"
for ds in $datasets ; do
  sqlite3 "$ds.db" <<<"$query" | tr '|' '\t' >> miga-project.txt
  printf '%s\n' "$ds" >> miga-project.log
done

# R-ify: save the table as Rdata and write a 100-break histogram
R --vanilla <<'RSCRIPT'

haai <- read.table('miga-project.txt', sep='\t', h=T);
save(haai, file='miga-project.Rdata');
h <- hist(haai[,'value'], breaks=100, plot=FALSE);
write.table(
  cbind(h[['breaks']][-length(h[['breaks']])],h[['breaks']][-1],h[['counts']]),
  file='miga-project.hist', quote=FALSE, sep='\t',
  col.names=FALSE, row.names=FALSE);

RSCRIPT

# Gzip
gzip -9 -f miga-project.txt

# Finalize
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
miga add_result -P "$PROJECT" -r haai_distances
|
39
|
+
|
data/scripts/init.bash
ADDED
@@ -0,0 +1,211 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
set -e
|
3
|
+
|
4
|
+
#=======[ Functions ]
|
5
|
+
#######################################
# Ask the user a question and print the answer.
# The question and a "[default] > " prompt are written to stderr, one line is
# read from stdin, and an empty answer is replaced by the default.
# Arguments: $1 - question to display
#            $2 - default answer
# Outputs:   the answer on stdout, without a trailing newline
# Returns:   0
#######################################
function ask_user {
  local question=$1
  local default=$2
  local user_answer=""
  # Quoted, so '?', '*' or '[' in the question are never glob-expanded
  printf '%s\n' "$question" >&2
  printf ' [%s] > ' "$default" >&2
  # -r keeps backslashes literal. End of input is not an error here: the
  # default is used instead.
  read -r user_answer || true
  user_answer=${user_answer:-$default}
  printf '%s' "$user_answer"
}
|
14
|
+
|
15
|
+
#######################################
# Ask where a required program is installed and register that directory.
# The directory of the program found through PATH is offered as default; no
# default is offered when the program is not on PATH. On success the
# directory is appended to PATH and a MIGA_PATH line is appended to
# $HOME/.miga_rc.
# Globals:   PATH (written), HOME (read)
# Arguments: $1 - executable name to test
#            $2 - human-readable name of the requirement
#            $3 - website shown in the question
#            $4 - hint shown in the question
# Returns:   0 on success; exits the shell with status 1 if the executable
#            is not found in the answered directory
#######################################
function check_req {
  local bin=$1
  local found default="" user_answer
  # 'type -P' only reports executables reachable through PATH, so a missing
  # tool gives an empty default rather than '.'
  if found=$(type -P "$bin") ; then
    default=$(dirname -- "$found")
  fi
  user_answer=$(ask_user "Where can I find $2 ($3)? $4" "$default")
  if [[ -n "$user_answer" && -x "$user_answer/$bin" ]] ; then
    export PATH="$PATH:$user_answer"
    echo "MIGA_PATH=\"$user_answer:\$MIGA_PATH\" # $2" >> "$HOME/.miga_rc"
  else
    echo "Cannot find $2 at '$user_answer/$bin'. Aborting..." >&2
    exit 1
  fi
}
|
27
|
+
|
28
|
+
#######################################
# Test whether an R package can be loaded.
# Runs R with a snippet that prints "GOT IT" when require() succeeds and
# looks for that marker in R's combined output.
# Arguments: $1 - name of the R package
# Returns:   0 if the marker was printed, non-zero otherwise
#######################################
function check_rlib {
  local rlib=$1
  local r_out
  # A failing R run just means "not available"
  r_out=$(echo "if(require($rlib)) cat('GOT','IT')" | R --vanilla -q 2>&1) \
    || true
  [[ "$r_out" == *"GOT IT"* ]]
}
|
34
|
+
|
35
|
+
#######################################
# Test whether a Ruby gem can be required.
# The result is ruby's own exit status, so anything ruby prints while
# loading the gem cannot turn a success into a failure.
# Arguments: $1 - name of the gem
# Returns:   0 if "require '<gem>'" succeeds, non-zero otherwise
#######################################
function check_gem {
  local gem=$1
  echo "require '$gem'" | ruby >/dev/null 2>&1
}
|
40
|
+
|
41
|
+
#=======[ Main ]
|
42
|
+
MIGA_STARTUP="no"
# Root of the installation: the parent of the directory holding this script
MIGA=$(cd "$(dirname "$0")/.."; pwd)
cat >&2 <<'WELCOME'

===[ Welcome to MiGA, the Microbial Genome Atlas ]===

I'm the initialization script, and I'll sniff around your computer to
make sure you have all the requirements for MiGA Daemons.

WELCOME

# Optionally display the list of requirements
show_reqs=$(ask_user "Would you like to see all the requirements before starting? (yes / no)" "no")
if [[ "$show_reqs" == "yes" ]] ; then
  printf '\n' >&2
  cat "$MIGA/utils/requirements.txt" >&2
  printf '\n' >&2
fi

# Reuse, discard, or keep untouched an existing configuration
if [[ -e "$HOME/.miga_rc" ]] ; then
  load_prev=$(ask_user "I found a previous configuration. Do you want to load the defaults within? (yes / no / cancel)" "yes")
  case "$load_prev" in
    yes)
      source "$HOME/.miga_rc"
      if [[ -n "$MIGA_CONFIG_DATE" ]] ; then
        echo " Loaded configuration from $MIGA_CONFIG_DATE" >&2
      else
        echo " Loaded incomplete configuration" >&2
      fi
      ;;
    no)
      rm "$HOME/.miga_rc"
      ;;
    cancel)
      exit 0
      ;;
    *)
      echo "Cannot understand your answer, please use 'yes', 'no', or 'cancel'. Aborting..." >&2
      exit 1
      ;;
  esac
fi

# Start a fresh configuration file
printf '#!/bin/bash\n# MiGA made this on %s\n\n' "$(date)" > "$HOME/.miga_rc"
|
82
|
+
|
83
|
+
# Check Software requirements
MIGA_STARTUP=$(ask_user "Is there a script I need to load at startup? (no / path to the script to load)" "$MIGA_STARTUP")
if [[ "$MIGA_STARTUP" != "no" ]] ; then
  # Record the startup script in the configuration and load it right away
  cat >> "$HOME/.miga_rc" <<STARTUP
MIGA_STARTUP='$MIGA_STARTUP'
source "\$MIGA_STARTUP"

STARTUP
  source "$MIGA_STARTUP"
fi
printf '\nLooking for Software requirements:\n' >&2
# Skip the two header lines and collapse runs of tabs into a single tab
reqs=$(tail -n+3 "$MIGA/utils/requirements.txt" | perl -pe 's/\t+/\t/g')
# Iterate line by line: split $reqs on newlines only
IFS_BU=$IFS
IFS=$'\n'
for ln in $reqs ; do
  # Columns: name, executable to test, website, hint
  rname=$(awk -F'\t' '{print $1}' <<<"$ln")
  rtest=$(awk -F'\t' '{print $2}' <<<"$ln")
  rwebs=$(awk -F'\t' '{print $3}' <<<"$ln")
  rhint=$(awk -F'\t' '{print $4}' <<<"$ln")
  check_req "$rtest" "$rname" "$rwebs" "$rhint"
done
IFS=$IFS_BU
printf '%s\n' 'export PATH=$MIGA_PATH$PATH' >> "$HOME/.miga_rc"
|
105
|
+
|
106
|
+
# Check for R packages
echo "
Looking for R packages:" >&2
if ! check_rlib enveomics.R ; then
  echo "+ Installing enveomics.R" >&2
  # enveomics.R is installed from the directory next to the one holding
  # FastQ.tag.rb. Quoted so a path with spaces stays a single argument.
  R CMD INSTALL "$(dirname "$(which "FastQ.tag.rb")")/../enveomics.R"
fi
RLIBS="ape ggdendro ggplot2 gridExtra cluster dendextend vegan scatterplot3d"
for lib in $RLIBS ; do
  if ! check_rlib "$lib" ; then
    echo "+ Installing $lib" >&2
    echo "install.packages('$lib', repos='http://cran.rstudio.com/')" \
      | R --vanilla -q
  fi
done
|
121
|
+
|
122
|
+
# Check for ruby gems
printf '\nLooking for Ruby gems:\n' >&2
GEMS="rest_client sqlite3 daemons json"
for gem_name in $GEMS ; do
  # Nothing to do for gems that can already be required
  if check_gem "$gem_name" ; then
    continue
  fi
  printf '+ Installing %s (user-only)\n' "$gem_name" >&2
  gem install --user "$gem_name"
done
|
132
|
+
|
133
|
+
# Check for other files
echo "
Looking for additional files:
+ MetaGeneMark license key" >&2
# Resolve the tool explicitly, so a missing binary gives a clear message
# instead of a directory derived from an empty path
gmhmmp_bin=$(type -P gmhmmp) || {
  echo "Cannot find gmhmmp in the PATH. Aborting..." >&2
  exit 1
}
GM=$(dirname -- "$gmhmmp_bin")
if [[ ! -e "$GM/gm_key" && ! -e "$GM/gm_key_64" && ! -e "$GM/gm_key_32" \
      && ! -e "$GM/.gm_key" && ! -e "$HOME/.gm_key" ]] ; then
  echo "Cannot find it, please place your license key in '$GM/gm_key'. Aborting..." >&2
  exit 1
fi
echo "+ MetaGeneMark scripts" >&2
if [[ ! -e "$GM/aa_from_gff.pl" || ! -e "$GM/nt_from_gff.pl" ]] ; then
  echo "Cannot find it, please place aa_from_gff.pl and nt_from_gff.pl in '$GM/'. Aborting..." >&2
  exit 1
fi
echo "+ MyTaxa scores database" >&2
mytaxa_bin=$(type -P MyTaxa) || {
  echo "Cannot find MyTaxa in the PATH. Aborting..." >&2
  exit 1
}
MT=$(dirname -- "$mytaxa_bin")
if [[ ! -d "$MT/db" ]] ; then
  echo "Cannot find it, please execute 'python $MT/utils/download_db.py'. Aborting..." >&2
  exit 1
fi
echo "+ MyTaxa DIAMOND database" >&2
if [[ ! -e "$MT/AllGenomes.faa.dmnd" ]] ; then
  echo "Cannot find it, please download 'http://enve-omics.ce.gatech.edu/data/public_mytaxa/AllGenomes.faa.dmnd' into '$MT'. Aborting..." >&2
  exit 1
fi
|
158
|
+
|
159
|
+
# Configure daemon: collect the defaults and write $HOME/.miga_daemon.json
echo "
Default daemon configuration:" >&2
dtype=$(ask_user "Please select the type of daemon you want to setup (bash / qsub / msub)" "bash")
case "$dtype" in
  bash)
    dlatency=$(ask_user "For how long should I sleep? (# in seconds)" "30")
    dmaxjobs=$(ask_user "How many jobs can I launch at once?" "6")
    dppn=$(ask_user "How many CPUs can I use per job?" "2")
    echo "Setting up internal daemon defaults, if you don't understand this just leave default values:" >&2
    dcmd=$(ask_user "How should I launch tasks? Use %1\$s for script path, %2\$s for variables, %3\$d for CPUs, %4\$s for log file, and %5\$s for task name." "%2\$s '%1\$s' &> '%4\$s'")
    dvar=$(ask_user "How should I pass variables? Use %1\$s for keys and %2\$s for values." "%1\$s=%2\$s")
    dsep=$(ask_user "What should I use to separate variables?" " ")
    dalive=$(ask_user "How can I know that a process is still alive? Use %1\$s for PID, output should be 1 for running and 0 for non-running." "ps -p '%1\$s'|tail -n+2|wc -l|awk '{print \$1}'")
    ;;
  [qm]sub)
    dqueue=$(ask_user "What's the name of the queue I should use?" "")
    dlatency=$(ask_user "How long should I sleep? (# in seconds)" "150")
    dmaxjobs=$(ask_user "How many jobs can I launch at once?" "300")
    dppn=$(ask_user "How many CPUs can I use per job?" "4")
    echo "Setting up internal daemon defaults, if you don't understand this just leave default values:" >&2
    # The log file placeholder is a string (%4$s), as used in the default
    dcmd=$(ask_user "How should I launch tasks? Use %1\$s for script path, %2\$s for variables, and %3\$d for CPUs, %4\$s for log file, and %5\$s for task name." \
      "$dtype -q '$dqueue' -v '%2\$s' -l nodes=1:ppn=%3\$d %1\$s -j oe -o '%4\$s' -N '%5\$s' | grep .")
    dvar=$(ask_user "How should I pass variables? Use %1\$s for keys and %2\$s for values." "%1\$s=%2\$s")
    dsep=$(ask_user "What should I use to separate variables?" ",")
    if [[ "$dtype" == "qsub" ]] ; then
      dalive=$(ask_user "How can I know that a process is still alive? Use %1\$s for job id, output should be 1 for running and 0 for non-running." \
        "qstat -f '%1\$s'|grep ' job_state ='|perl -pe 's/.*= //'|grep '[^C]'|tail -n1|wc -l|awk '{print \$1}'")
    else
      dalive=$(ask_user "How can I know that a process is still alive? Use %1\$s for job id, output should be 1 for running and 0 for non-running." \
        "checkjob '%1\$s'|grep '^State:'|perl -pe 's/.*: //'|grep 'Deferred\\|Hold\\|Idle\\|Starting\\|Running\\|Blocked'|tail -n1|wc -l|awk '{print \$1}'")
    fi
    ;;
  *)
    # Without this, an unknown type would write a file with empty numeric
    # fields
    echo "Cannot understand the daemon type '$dtype', please use 'bash', 'qsub', or 'msub'. Aborting..." >&2
    exit 1
    ;;
esac
# NOTE(review): the answers are written verbatim; an answer containing a
# double quote or a backslash is not escaped for JSON.
cat > "$HOME/.miga_daemon.json" <<DAEMON_JSON
{
  "created": "$(date "+%Y-%m-%d %H:%M:%S %z")",
  "updated": "$(date "+%Y-%m-%d %H:%M:%S %z")",
  "type" : "$dtype",
  "cmd" : "$dcmd",
  "var" : "$dvar",
  "varsep" : "$dsep",
  "alive" : "$dalive",
  "latency": $dlatency,
  "maxjobs": $dmaxjobs,
  "ppn" : $dppn
}
DAEMON_JSON
|
206
|
+
|
207
|
+
# Confirm configuration: the date marks the configuration file as complete
printf "\nMIGA_CONFIG_DATE='%s'\n\n" "$(date "+%Y-%m-%d %H:%M:%S %z")" \
  >> "$HOME/.miga_rc"
|
211
|
+
|
data/scripts/miga.bash
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
#!/bin/bash
# Common setup sourced by the task scripts: stop on errors, load the user
# configuration and put $MIGA/bin first in the PATH.
set -e
#MIGA=${MIGA:-$(cd "$(dirname "$0")/.."; pwd)}
. "$HOME/.miga_rc"
PATH="$MIGA/bin:$PATH"
export PATH

# Succeeds when the path given as first argument exists
exists() {
  test -e "$1"
}

#if [[ "$RUNTYPE" == "qsub" ]] ; then
#elif [[ "$RUNTYPE" == "msub" ]] ; then
#fi
|
12
|
+
|