miga-base 0.2.0.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +351 -0
  3. data/actions/add_result +61 -0
  4. data/actions/add_taxonomy +86 -0
  5. data/actions/create_dataset +62 -0
  6. data/actions/create_project +70 -0
  7. data/actions/daemon +69 -0
  8. data/actions/download_dataset +77 -0
  9. data/actions/find_datasets +63 -0
  10. data/actions/import_datasets +86 -0
  11. data/actions/index_taxonomy +71 -0
  12. data/actions/list_datasets +83 -0
  13. data/actions/list_files +67 -0
  14. data/actions/unlink_dataset +52 -0
  15. data/bin/miga +48 -0
  16. data/lib/miga/daemon.rb +178 -0
  17. data/lib/miga/dataset.rb +286 -0
  18. data/lib/miga/gui.rb +289 -0
  19. data/lib/miga/metadata.rb +74 -0
  20. data/lib/miga/project.rb +268 -0
  21. data/lib/miga/remote_dataset.rb +154 -0
  22. data/lib/miga/result.rb +102 -0
  23. data/lib/miga/tax_index.rb +70 -0
  24. data/lib/miga/taxonomy.rb +107 -0
  25. data/lib/miga.rb +83 -0
  26. data/scripts/_distances_noref_nomulti.bash +86 -0
  27. data/scripts/_distances_ref_nomulti.bash +105 -0
  28. data/scripts/aai_distances.bash +40 -0
  29. data/scripts/ani_distances.bash +39 -0
  30. data/scripts/assembly.bash +38 -0
  31. data/scripts/cds.bash +45 -0
  32. data/scripts/clade_finding.bash +27 -0
  33. data/scripts/distances.bash +30 -0
  34. data/scripts/essential_genes.bash +29 -0
  35. data/scripts/haai_distances.bash +39 -0
  36. data/scripts/init.bash +211 -0
  37. data/scripts/miga.bash +12 -0
  38. data/scripts/mytaxa.bash +93 -0
  39. data/scripts/mytaxa_scan.bash +85 -0
  40. data/scripts/ogs.bash +36 -0
  41. data/scripts/read_quality.bash +37 -0
  42. data/scripts/ssu.bash +35 -0
  43. data/scripts/subclades.bash +26 -0
  44. data/scripts/trimmed_fasta.bash +47 -0
  45. data/scripts/trimmed_reads.bash +57 -0
  46. data/utils/adapters.fa +302 -0
  47. data/utils/mytaxa_scan.R +89 -0
  48. data/utils/mytaxa_scan.rb +58 -0
  49. data/utils/requirements.txt +19 -0
  50. data/utils/subclades-compile.rb +48 -0
  51. data/utils/subclades.R +171 -0
  52. metadata +185 -0
@@ -0,0 +1,105 @@
1
+ #!/bin/bash
2
+ # Available variables: $PROJECT, $DATASET, $RUNTYPE, $MIGA, $CORES, $TMPDIR,
3
+ # $NOMULTI, $REF
4
+
5
#######################################
# Every tenth call, check the temporary distance databases with sqlite3 and
# copy them next to the results; always advance the call counter.
# Globals:   N (read/written), TMPDIR (read), DATASET (read)
# Outputs:   whatever sqlite3 prints for the row-count queries (stdout)
# Exits:     with status 1 when sqlite3 fails on a temporary database
#######################################
function checkpoint_n {
  local t tab
  # An unset counter is treated as 0 instead of breaking the test
  if [[ "${N:-0}" -eq 10 ]] ; then
    for t in 01.haai 02.aai 03.ani ; do
      [[ -s "$TMPDIR/$t.db" ]] || continue
      # Only the ANI database uses the 'ani' table; the others use 'aai'
      tab="aai"
      [[ "$t" == "03.ani" ]] && tab="ani"
      # Do not overwrite the saved copy with a database that cannot be queried
      echo "select count(*) from $tab;" | sqlite3 "$TMPDIR/$t.db" || exit 1
      cp "$TMPDIR/$t.db" "$t/$DATASET.db"
    done
    N=0
  fi
  N=$(( ${N:-0} + 1 ))
}
21
+
22
+ ESS="../07.annotation/01.function/01.essential"
23
+
24
+ # Initialize temporals
25
+ for t in 01.haai 02.aai 03.ani ; do
26
+ [[ -s $t/$DATASET.db ]] && cp $t/$DATASET.db $TMPDIR/$t.db
27
+ done
28
+ echo "create table if not exists aai(seq1 varchar(256), seq2 varchar(256)," \
29
+ "aai float, sd float, n int, omega int);" | sqlite3 $TMPDIR/02.aai.db
30
+ N=1
31
+
32
+ # Traverse "nearly-half" of the ref-datasets using first-come-first-served
33
+ for i in $(miga list_datasets -P "$PROJECT" --ref --no-multi) ; do
34
+ echo "=[ $i ]"
35
+ date "+%Y-%m-%d %H:%M:%S %z"
36
+ HAAI=""; AAI=""; ANI="";
37
+ # Check if the i-th dataset is ready
38
+ [[ -s $ESS/$i.done && -s $ESS/$i.json ]] || continue
39
+ # Check if this is done (e.g., in a previous failed iteration)
40
+ AAI=$(echo "select aai from aai where seq1='$DATASET' and seq2='$i';" \
41
+ | sqlite3 $TMPDIR/02.aai.db || echo "")
42
+ # Try the other direction
43
+ if [[ "$AAI" == "" && -s 02.aai/$i.db ]] ; then
44
+ cp "02.aai/$i.db" "$TMPDIR/$i.db"
45
+ AAI=$(echo "select aai from aai where seq2='$DATASET' and seq1='$i';" \
46
+ | sqlite3 "$TMPDIR/$i.db" || echo "")
47
+ rm "$TMPDIR/$i.db"
48
+ fi
49
+ # Try with hAAI
50
+ if [[ "$AAI" == "" ]] ; then
51
+ [[ -e "$TMPDIR/$DATASET.ess.faa" ]] \
52
+ || cp $ESS/$DATASET.ess.faa $TMPDIR/$DATASET.ess.faa
53
+ HAAI=$(aai.rb -1 $TMPDIR/$DATASET.ess.faa -2 $ESS/$i.ess.faa \
54
+ -t $CORES -a -n 10 -S $TMPDIR/01.haai.db --name1 $DATASET \
55
+ --name2 $i --lookup-first --no-save-rbm || echo "")
56
+ if [[ "$HAAI" != "" \
57
+ && $(perl -MPOSIX -e "print floor $HAAI") -lt 90 ]] ; then
58
+ AAI=$(perl -e \
59
+ "printf '%f', 100-exp(2.435076 + 0.4275193*log(100-$HAAI))")
60
+ echo "insert into aai values('$DATASET','$i','$AAI',0,0,0);" \
61
+ | sqlite3 $TMPDIR/02.aai.db
62
+ fi
63
+ fi
64
+ # Try with complete AAI
65
+ if [[ "$AAI" == "" ]] ; then
66
+ [[ -e "$TMPDIR/$DATASET.faa" ]] \
67
+ || cp ../06.cds/$DATASET.faa $TMPDIR/$DATASET.faa
68
+ AAI=$(aai.rb -1 $TMPDIR/$DATASET.faa -2 ../06.cds/$i.faa -t $CORES -a \
69
+ -S $TMPDIR/02.aai.db --name1 $DATASET --name2 $i --lookup-first \
70
+ || echo "")
71
+ fi
72
+ date "+%Y-%m-%d %H:%M:%S %z"
73
+ # Check if ANI is meaningful
74
+ if [[ -e "../05.assembly/$DATASET.LargeContigs.fna" \
75
+ && -e "../05.assembly/$i.LargeContigs.fna" \
76
+ && $(perl -MPOSIX -e "print ceil $AAI") -gt 90 ]] ; then
77
+ # Check if this is done (e.g., in a previous failed iteration)
78
+ ANI=$(echo "select ani from ani where seq1='$DATASET' and seq2='$i';" \
79
+ | sqlite3 $TMPDIR/03.ani.db || echo "")
80
+ # Try the other direction
81
+ if [[ "$ANI" == "" && -s 03.ani/$i.db ]] ; then
82
+ cp "03.ani/$i.db" "$TMPDIR/$i.db"
83
+ ANI=$(echo "select ani from ani" \
84
+ "where seq2='$DATASET' and seq1='$i';" \
85
+ | sqlite3 "$TMPDIR/$i.db" || echo "")
86
+ rm "$TMPDIR/$i.db"
87
+ fi
88
+ # Calculate it
89
+ if [[ "$ANI" == "" ]] ; then
90
+ [[ -e "$TMPDIR/$DATASET.LargeContigs.fna" ]] \
91
+ || cp ../05.assembly/$DATASET.LargeContigs.fna \
92
+ $TMPDIR/$DATASET.LargeContigs.fna
93
+ ANI=$(ani.rb -1 $TMPDIR/$DATASET.LargeContigs.fna \
94
+ -2 ../05.assembly/$i.LargeContigs.fna -t $CORES \
95
+ -S $TMPDIR/03.ani.db -a --name1 $DATASET --name2 $i \
96
+ --no-save-regions --no-save-rbm --lookup-first \
97
+ || echo "")
98
+ fi
99
+ fi
100
+ echo "$AAI;$ANI"
101
+ checkpoint_n
102
+ done
103
+ N=10
104
+ checkpoint_n
105
+
@@ -0,0 +1,40 @@
1
#!/bin/bash
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
# Gathers the 'aai' table of every reference, non-multi dataset into one
# tab-delimited file, then lets R save it as an object and as a histogram.
echo "MiGA: $MIGA"
echo "Project: $PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/09.distances/02.aai"

# Initialize
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.start"

: > miga-project.log
DS=$(miga list_datasets -P "$PROJECT" --ref --no-multi)

# Extract values; rows with omega=0 are labelled 'hAAI_AAI', the rest 'AAI'
printf 'metric\ta\tb\tvalue\tsd\tn\tomega\n' > miga-project.txt
QUERY="SELECT CASE WHEN omega!=0 THEN 'AAI' ELSE 'hAAI_AAI' END,  seq1, seq2, aai, sd, n, omega from aai;"
for i in $DS ; do
  sqlite3 "$i.db" <<< "$QUERY" | tr '|' '\t' >> miga-project.txt
  echo "$i" >> miga-project.log
done

# R-ify
R --vanilla <<'EOF'

aai <- read.table('miga-project.txt', sep='\t', h=T);
save(aai, file='miga-project.Rdata');
h <- hist(aai[,'value'], breaks=100, plot=FALSE);
write.table(
  cbind(h[['breaks']][-length(h[['breaks']])],h[['breaks']][-1],h[['counts']]),
  file='miga-project.hist', quote=FALSE, sep='\t',
  col.names=FALSE, row.names=FALSE);

EOF

# Gzip
gzip -9 -f miga-project.txt

# Finalize
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
miga add_result -P "$PROJECT" -r aai_distances
40
+
@@ -0,0 +1,39 @@
1
#!/bin/bash
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
# Gathers the 'ani' table of every reference, non-multi dataset into one
# tab-delimited file, then lets R save it as an object and as a histogram.
echo "MiGA: $MIGA"
echo "Project: $PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/09.distances/03.ani"

# Initialize
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.start"

: > miga-project.log
DS=$(miga list_datasets -P "$PROJECT" --ref --no-multi)

# Extract values
printf 'metric\ta\tb\tvalue\tsd\tn\tomega\n' > miga-project.txt
QUERY="SELECT 'ANI', seq1, seq2, ani, sd, n, omega from ani ;"
for i in $DS ; do
  sqlite3 "$i.db" <<< "$QUERY" | tr '|' '\t' >> miga-project.txt
  echo "$i" >> miga-project.log
done

# R-ify
R --vanilla <<'EOF'

ani <- read.table('miga-project.txt', sep='\t', h=T);
save(ani, file='miga-project.Rdata');
h <- hist(ani[,'value'], breaks=100, plot=FALSE);
write.table(
  cbind(h[['breaks']][-length(h[['breaks']])],h[['breaks']][-1],h[['counts']]),
  file='miga-project.hist', quote=FALSE, sep='\t',
  col.names=FALSE, row.names=FALSE);

EOF

# Gzip
gzip -9 -f miga-project.txt

# Finalize
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
miga add_result -P "$PROJECT" -r ani_distances
39
+
@@ -0,0 +1,38 @@
1
#!/bin/bash
# Available variables: $PROJECT, $DATASET, $RUNTYPE, $MIGA, $CORES
# Assembles the trimmed reads of $DATASET with idba_ud, links the resulting
# scaffolds (or contigs) as $DATASET.AllContigs.fna, and keeps the entries
# whose second FastA.length.pl column is >= 1000 in $DATASET.LargeContigs.fna.
echo "MiGA: $MIGA"
echo "Project: $PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/05.assembly" || exit 1

# Initialize
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.start"

# Assemble: coupled reads first, then single reads; plain before gzipped
FA="../04.trimmed_fasta/$DATASET.CoupledReads.fa"
[[ -e "$FA" ]] || FA="$FA.gz"
[[ -e "$FA" ]] || FA="../04.trimmed_fasta/$DATASET.SingleReads.fa"
[[ -e "$FA" ]] || FA="$FA.gz"
idba_ud --pre_correction -r "$FA" -o "$DATASET" --num_threads "$CORES"

# Clean: -f keeps a pattern that matches nothing from aborting under 'set -e'
rm -f -- "$DATASET"/kmer "$DATASET"/graph-*.fa "$DATASET"/align-* \
  "$DATASET"/local-contig-*.fa "$DATASET"/contig-*.fa

# Extract: -f lets a re-run replace the link left by an earlier attempt
if [[ -s "$DATASET/scaffold.fa" ]] ; then
  ln -sf "$DATASET/scaffold.fa" "$DATASET.AllContigs.fna"
else
  ln -sf "$DATASET/contig.fa" "$DATASET.AllContigs.fna"
fi
FastA.length.pl "$DATASET.AllContigs.fna" | awk '$2>=1000{print $1}' \
  | FastA.filter.pl /dev/stdin "$DATASET.AllContigs.fna" \
  > "$DATASET.LargeContigs.fna"

# Finalize
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
miga add_result -P "$PROJECT" -D "$DATASET" -r assembly
38
+
data/scripts/cds.bash ADDED
@@ -0,0 +1,45 @@
1
#!/bin/bash
# Available variables: $PROJECT, $DATASET, $RUNTYPE, $MIGA, $CORES
# Runs MetaGeneMark (gmhmmp) on the large contigs of $DATASET, then runs
# aa_from_gff.pl and nt_from_gff.pl on the GFF2 output to write $DATASET.faa
# and $DATASET.fna.
echo "MiGA: $MIGA"
echo "Project: $PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/06.cds" || exit 1

# Initialize
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.start"
# Resolve gmhmmp explicitly so that a missing program is reported as such,
# and quote the result so an installation path with spaces stays intact
GMHMMP=$(command -v gmhmmp) || {
  echo "Cannot find gmhmmp in the PATH." >&2
  exit 1
}
GM=$(dirname -- "$GMHMMP")

# Register key: the first existing candidate is copied as ./.gm_key
if [[ ! -e .gm_key ]] ; then
  for key in "$GM/gm_key" "$GM/gm_key_64" "$GM/gm_key_32" "$GM/.gm_key" \
      "$HOME/.gm_key" ; do
    if [[ -e "$key" ]] ; then
      cp "$key" ".gm_key"
      break
    fi
  done
  if [[ ! -e .gm_key ]] ; then
    echo "Impossible to find MetaGeneMark key, please register your copy" \
      "and place the key in '$GM/gm_key'." >&2
    exit 1
  fi
fi

# Run MetaGeneMark
gmhmmp -a -d -m "$GM/MetaGeneMark_v1.mod" -f G -o "$DATASET.gff2" \
  "../05.assembly/$DATASET.LargeContigs.fna"

# Extract
perl "$GM/aa_from_gff.pl" < "$DATASET.gff2" > "$DATASET.faa"
perl "$GM/nt_from_gff.pl" < "$DATASET.gff2" > "$DATASET.fna"

# Gzip
gzip -9 -f "$DATASET.gff2"

# Finalize
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
miga add_result -P "$PROJECT" -D "$DATASET" -r cds
45
+
@@ -0,0 +1,27 @@
1
#!/bin/bash
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
# Clusters genomes with ogs.mcl.rb from the project-wide ANI table and lists
# the clusters with at least five members as proposed clades.
echo "MiGA: $MIGA"
echo "Project: $PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/10.clades/01.find"

# Initialize
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.start"

# Markov-cluster genomes by ANI
# Columns 2-4 of the table (a, b, value) without the header line; note that
# the file is named 'aai90' although its source is the ANI table
gunzip -c ../../09.distances/03.ani/miga-project.txt.gz \
  | awk -F'\t' 'NR > 1 { print $2 "\t" $3 "\t" $4 }' \
  > genome-genome.aai90.rbm
ogs.mcl.rb -d . -o miga-project.ani-clades -t "$CORES" -i \
  -f '(\S+)-(\S+)\.aai90\.rbm'
# Same pairs restricted to values of 95 or more
awk -F'\t' '$3>=95' genome-genome.aai90.rbm > genome-genome.ani95.rbm
ogs.mcl.rb -d . -o miga-project.ani95-clades -t "$CORES" -b \
  -f '(\S+)-(\S+)\.ani95\.rbm'

# Propose clade projects
tail -n +2 miga-project.ani95-clades | tr ',' '\t' | awk 'NF >= 5' \
  > miga-project.proposed-clades

# Finalize
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
miga add_result -P "$PROJECT" -r clade_finding
27
+
@@ -0,0 +1,30 @@
1
#!/bin/bash
# Available variables: $PROJECT, $DATASET, $RUNTYPE, $MIGA, $CORES
# Creates a scratch directory and sources the distances submodule that
# matches the dataset (reference or not; multi datasets are skipped).
echo "MiGA: $MIGA"
echo "Project: $PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/09.distances" || exit 1

# Initialize
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.start"
TMPDIR=$(mktemp -d /tmp/MiGA.XXXXXXXXXXXX) || exit 1
# Remove the scratch directory on every exit path, including an 'exit' or a
# 'set -e' abort inside the sourced submodules. Single quotes defer the
# expansion of $TMPDIR until the handler runs.
trap 'rm -rf -- "$TMPDIR"' EXIT
# Signals end the script with a failure status and reach the EXIT handler
trap 'exit 1' SIGHUP SIGINT SIGTERM

# Check type of dataset
NOMULTI=$(miga list_datasets -P "$PROJECT" -D "$DATASET" --no-multi \
  | wc -l | awk '{print $1}')
REF=$(miga list_datasets -P "$PROJECT" -D "$DATASET" --ref \
  | wc -l | awk '{print $1}')

# Call submodule
if [[ "$NOMULTI" -eq "1" && "$REF" -eq "1" ]] ; then
  source "$MIGA/scripts/_distances_ref_nomulti.bash"
elif [[ "$NOMULTI" -eq "1" ]] ; then
  source "$MIGA/scripts/_distances_noref_nomulti.bash"
fi

# Finalize
rm -rf -- "$TMPDIR"
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
miga add_result -P "$PROJECT" -D "$DATASET" -r distances
30
+
@@ -0,0 +1,29 @@
1
#!/bin/bash
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
# Runs HMM.essential.rb on the predicted proteins of $DATASET, writing
# $DATASET.ess.faa and a log inside a freshly created $DATASET.ess directory.
echo "MiGA: $MIGA"
echo "Project: $PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/07.annotation/01.function/01.essential"

# Initialize
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.start"

# Find and extract essential genes
if [[ -d "$DATASET.ess" ]] ; then
  rm -R "$DATASET.ess"
fi
mkdir "$DATASET.ess"
TYPE=$(miga list_datasets -P "$PROJECT" -D "$DATASET" --metadata "type" \
  | awk '{print $2}')
# Metagenomes and viromes get the additional --metagenome switch
MODE=()
case "$TYPE" in
  metagenome|virome) MODE=(--metagenome) ;;
esac
HMM.essential.rb -i "../../../06.cds/$DATASET.faa" -o "$DATASET.ess.faa" \
  -m "$DATASET.ess/" -t "$CORES" -r "$DATASET" "${MODE[@]}" \
  > "$DATASET.ess/log"

# Finalize
date "+%Y-%m-%d %H:%M:%S %z" > "$DATASET.done"
miga add_result -P "$PROJECT" -D "$DATASET" -r essential
29
+
@@ -0,0 +1,39 @@
1
#!/bin/bash
# Available variables: $PROJECT, $RUNTYPE, $MIGA, $CORES
# Gathers the 'aai' table of every reference, non-multi dataset in 01.haai
# into one tab-delimited file, then lets R save it as an object and as a
# histogram.
echo "MiGA: $MIGA"
echo "Project: $PROJECT"
source "$MIGA/scripts/miga.bash" || exit 1
cd "$PROJECT/data/09.distances/01.haai"

# Initialize
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.start"

: > miga-project.log
DS=$(miga list_datasets -P "$PROJECT" --ref --no-multi)

# Extract values
printf 'metric\ta\tb\tvalue\tsd\tn\tomega\n' > miga-project.txt
QUERY="SELECT 'hAAI', seq1, seq2, aai, sd, n, omega from aai ;"
for i in $DS ; do
  sqlite3 "$i.db" <<< "$QUERY" | tr '|' '\t' >> miga-project.txt
  echo "$i" >> miga-project.log
done

# R-ify
R --vanilla <<'EOF'

haai <- read.table('miga-project.txt', sep='\t', h=T);
save(haai, file='miga-project.Rdata');
h <- hist(haai[,'value'], breaks=100, plot=FALSE);
write.table(
  cbind(h[['breaks']][-length(h[['breaks']])],h[['breaks']][-1],h[['counts']]),
  file='miga-project.hist', quote=FALSE, sep='\t',
  col.names=FALSE, row.names=FALSE);

EOF

# Gzip
gzip -9 -f miga-project.txt

# Finalize
date "+%Y-%m-%d %H:%M:%S %z" > "miga-project.done"
miga add_result -P "$PROJECT" -r haai_distances
39
+
data/scripts/init.bash ADDED
@@ -0,0 +1,211 @@
1
+ #!/bin/bash
2
+ set -e
3
+
4
+ #=======[ Functions ]
5
+ function ask_user {
6
+ local question=$1
7
+ local default=$2
8
+ echo $question >&2
9
+ echo -n " [$default] > " >&2
10
+ read user_answer
11
+ user_answer=${user_answer:-$default}
12
+ echo -n "$user_answer"
13
+ }
14
+
15
+ function check_req {
16
+ local bin=$1
17
+ local default=$(dirname "$(which "$bin")")
18
+ user_answer=$(ask_user "Where can I find $2 ($3)? $4" "$default")
19
+ if [[ -x "$user_answer/$bin" ]] ; then
20
+ export PATH="$PATH:$user_answer"
21
+ echo "MIGA_PATH=\"$user_answer:\$MIGA_PATH\" # $2" >> "$HOME/.miga_rc"
22
+ else
23
+ echo "Cannot find $2 at '$user_answer/$bin'. Aborting..." >&2
24
+ exit 1
25
+ fi
26
+ }
27
+
28
+ function check_rlib {
29
+ local rlib=$1
30
+ gotit=$(echo "if(require($rlib)) cat('GOT','IT')" | R --vanilla -q 2>&1 \
31
+ | grep "GOT IT" | wc -l | awk '{print $1}')
32
+ [[ "$gotit" == "1" ]]
33
+ }
34
+
35
+ function check_gem {
36
+ local gem=$1
37
+ gotit=$(echo "require '$gem'" | ruby 2>/dev/null && echo 1)
38
+ [[ "$gotit" == "1" ]]
39
+ }
40
+
41
+ #=======[ Main ]
42
+ MIGA_STARTUP="no"
43
+ MIGA=$(cd "$(dirname "$0")/.."; pwd)
44
+ echo "
45
+ ===[ Welcome to MiGA, the Microbial Genome Atlas ]===
46
+
47
+ I'm the initialization script, and I'll sniff around your computer to
48
+ make sure you have all the requirements for MiGA Daemons.
49
+ " >&2
50
+
51
+ if [[ "$(ask_user "Would you like to see all the requirements before starting? (yes / no)" "no")" == "yes" ]] ; then
52
+ echo "" >&2
53
+ cat "$MIGA/utils/requirements.txt" >&2
54
+ echo "" >&2
55
+ fi
56
+
57
+ if [[ -e "$HOME/.miga_rc" ]] ; then
58
+ case "$(ask_user "I found a previous configuration. Do you want to load the defaults within? (yes / no / cancel)" "yes")" in
59
+ yes)
60
+ source "$HOME/.miga_rc"
61
+ if [[ "$MIGA_CONFIG_DATE" == "" ]] ; then
62
+ echo " Loaded incomplete configuration" >&2
63
+ else
64
+ echo " Loaded configuration from $MIGA_CONFIG_DATE" >&2
65
+ fi
66
+ ;;
67
+ no)
68
+ rm "$HOME/.miga_rc"
69
+ ;;
70
+ cancel)
71
+ exit 0
72
+ ;;
73
+ *)
74
+ echo "Cannot understand your answer, please use 'yes', 'no', or 'cancel'. Aborting..." >&2
75
+ exit 1
76
+ esac
77
+ fi
78
+
79
+ echo "#!/bin/bash
80
+ # MiGA made this on $(date)
81
+ " > "$HOME/.miga_rc"
82
+
83
+ # Check Software requirements
84
+ MIGA_STARTUP=$(ask_user "Is there a script I need to load at startup? (no / path to the script to load)" "$MIGA_STARTUP")
85
+ if [[ "$MIGA_STARTUP" != "no" ]] ; then
86
+ echo "MIGA_STARTUP='$MIGA_STARTUP'
87
+ source \"\$MIGA_STARTUP\"
88
+ " >> "$HOME/.miga_rc";
89
+ source "$MIGA_STARTUP";
90
+ fi
91
+ echo "
92
+ Looking for Software requirements:" >&2
93
+ reqs=$(tail -n+3 "$MIGA/utils/requirements.txt" | perl -pe 's/\t+/\t/g')
94
+ IFS_BU=$IFS
95
+ IFS=$'\n'
96
+ for ln in $reqs ; do
97
+ rname=$(echo "$ln" | awk -F'\t' '{print $1}')
98
+ rtest=$(echo "$ln" | awk -F'\t' '{print $2}')
99
+ rwebs=$(echo "$ln" | awk -F'\t' '{print $3}')
100
+ rhint=$(echo "$ln" | awk -F'\t' '{print $4}')
101
+ check_req "$rtest" "$rname" "$rwebs" "$rhint"
102
+ done
103
+ IFS=$IFS_BU
104
+ echo "export PATH=\$MIGA_PATH\$PATH" >> "$HOME/.miga_rc"
105
+
106
+ # Check for R packages
107
+ echo "
108
+ Looking for R packages:" >&2
109
+ if ! check_rlib enveomics.R ; then
110
+ echo "+ Installing enveomics.R" >&2
111
+ R CMD INSTALL $(dirname "$(which "FastQ.tag.rb")")/../enveomics.R
112
+ fi
113
+ RLIBS="ape ggdendro ggplot2 gridExtra cluster dendextend vegan scatterplot3d"
114
+ for lib in $RLIBS ; do
115
+ if ! check_rlib $lib ; then
116
+ echo "+ Installing $lib" >&2
117
+ echo "install.packages('$lib', repos='http://cran.rstudio.com/')" \
118
+ | R --vanilla -q
119
+ fi
120
+ done
121
+
122
+ # Check for ruby gems
123
+ echo "
124
+ Looking for Ruby gems:" >&2
125
+ GEMS="rest_client sqlite3 daemons json"
126
+ for gem in $GEMS ; do
127
+ if ! check_gem $gem ; then
128
+ echo "+ Installing $gem (user-only)" >&2
129
+ gem install --user $gem
130
+ fi
131
+ done
132
+
133
+ # Check for other files
134
+ echo "
135
+ Looking for additional files:
136
+ + MetaGeneMark license key" >&2
137
+ GM=$(dirname -- $(which gmhmmp))
138
+ if [[ ! -e "$GM/gm_key" && ! -e "$GM/gm_key_64" && ! -e "$GM/gm_key_32" && ! -e "$GM/.gm_key" && ! -e "$HOME/.gm_key" ]] ; then
139
+ echo "Cannot find it, please place your license key in '$GM/gm_key'. Aborting..." >&2
140
+ exit 1
141
+ fi
142
+ echo "+ MetaGeneMark scripts" >&2
143
+ if [[ ! -e "$GM/aa_from_gff.pl" || ! -e "$GM/nt_from_gff.pl" ]] ; then
144
+ echo "Cannot find it, please place aa_from_gff.pl and nt_from_gff.pl in '$GM/'. Aborting..." >&2
145
+ exit 1
146
+ fi
147
+ echo "+ MyTaxa scores database" >&2
148
+ MT=$(dirname -- $(which MyTaxa))
149
+ if [[ ! -d "$MT/db" ]] ; then
150
+ echo "Cannot find it, please execute 'python $MT/utils/download_db.py'. Aborting..." >&2
151
+ exit 1
152
+ fi
153
+ echo "+ MyTaxa DIAMOND database" >&2
154
+ if [[ ! -e "$MT/AllGenomes.faa.dmnd" ]] ; then
155
+ echo "Cannot find it, please download 'http://enve-omics.ce.gatech.edu/data/public_mytaxa/AllGenomes.faa.dmnd' into '$MT'. Aborting..." >&2
156
+ exit 1
157
+ fi
158
+
159
# Configure daemon
# Asks for the daemon type and its settings, then writes them as JSON to
# "$HOME/.miga_daemon.json".

# Escape backslashes and double quotes so a value is a valid JSON string
function _json_escape {
  local text=$1
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  printf '%s' "$text"
}

echo "
Default daemon configuration:" >&2
dtype=$(ask_user "Please select the type of daemon you want to setup (bash / qsub / msub)" "bash")
case "$dtype" in
  bash)
    dlatency=$(ask_user "For how long should I sleep? (# in seconds)" "30")
    dmaxjobs=$(ask_user "How many jobs can I launch at once?" "6")
    dppn=$(ask_user "How many CPUs can I use per job?" "2")
    echo "Setting up internal daemon defaults, if you don't understand this just leave default values:" >&2
    dcmd=$(ask_user "How should I launch tasks? Use %1\$s for script path, %2\$s for variables, %3\$d for CPUs, %4\$s for log file, and %5\$s for task name." "%2\$s '%1\$s' &> '%4\$s'")
    dvar=$(ask_user "How should I pass variables? Use %1\$s for keys and %2\$s for values." "%1\$s=%2\$s")
    dsep=$(ask_user "What should I use to separate variables?" " ")
    dalive=$(ask_user "How can I know that a process is still alive? Use %1\$s for PID, output should be 1 for running and 0 for non-running." "ps -p '%1\$s'|tail -n+2|wc -l|awk '{print \$1}'")
    ;;
  [qm]sub)
    dqueue=$(ask_user "What's the name of the queue I should use?" "")
    dlatency=$(ask_user "How long should I sleep? (# in seconds)" "150")
    dmaxjobs=$(ask_user "How many jobs can I launch at once?" "300")
    dppn=$(ask_user "How many CPUs can I use per job?" "4")
    echo "Setting up internal daemon defaults, if you don't understand this just leave default values:" >&2
    # The log file placeholder is %4\$s, matching the default command below
    dcmd=$(ask_user "How should I launch tasks? Use %1\$s for script path, %2\$s for variables, and %3\$d for CPUs, %4\$s for log file, and %5\$s for task name." \
      "$dtype -q '$dqueue' -v '%2\$s' -l nodes=1:ppn=%3\$d %1\$s -j oe -o '%4\$s' -N '%5\$s' | grep .")
    dvar=$(ask_user "How should I pass variables? Use %1\$s for keys and %2\$s for values." "%1\$s=%2\$s")
    dsep=$(ask_user "What should I use to separate variables?" ",")
    if [[ "$dtype" == "qsub" ]] ; then
      dalive=$(ask_user "How can I know that a process is still alive? Use %1\$s for job id, output should be 1 for running and 0 for non-running." \
        "qstat -f '%1\$s'|grep ' job_state ='|perl -pe 's/.*= //'|grep '[^C]'|tail -n1|wc -l|awk '{print \$1}'")
    else
      dalive=$(ask_user "How can I know that a process is still alive? Use %1\$s for job id, output should be 1 for running and 0 for non-running." \
        "checkjob '%1\$s'|grep '^State:'|perl -pe 's/.*: //'|grep 'Deferred\\|Hold\\|Idle\\|Starting\\|Running\\|Blocked'|tail -n1|wc -l|awk '{print \$1}'")
    fi
    ;;
  *)
    # Without this, the settings below stay empty and the JSON is invalid
    echo "Cannot understand your answer, please use 'bash', 'qsub', or 'msub'. Aborting..." >&2
    exit 1
    ;;
esac
cat > "$HOME/.miga_daemon.json" <<EOF
{
  "created": "$(date "+%Y-%m-%d %H:%M:%S %z")",
  "updated": "$(date "+%Y-%m-%d %H:%M:%S %z")",
  "type"   : "$(_json_escape "$dtype")",
  "cmd"    : "$(_json_escape "$dcmd")",
  "var"    : "$(_json_escape "$dvar")",
  "varsep" : "$(_json_escape "$dsep")",
  "alive"  : "$(_json_escape "$dalive")",
  "latency": $dlatency,
  "maxjobs": $dmaxjobs,
  "ppn"    : $dppn
}
EOF
206
+
207
+ # Confirm configuration
208
+ echo "
209
+ MIGA_CONFIG_DATE='$(date "+%Y-%m-%d %H:%M:%S %z")'
210
+ " >> "$HOME/.miga_rc"
211
+
data/scripts/miga.bash ADDED
@@ -0,0 +1,12 @@
1
+ #!/bin/bash
2
+ set -e
3
+ #MIGA=${MIGA:-$(cd "$(dirname "$0")/.."; pwd)}
4
+ source "$HOME/.miga_rc"
5
+ export PATH="$MIGA/bin:$PATH"
6
+
7
+ function exists { [[ -e "$1" ]] ; }
8
+
9
+ #if [[ "$RUNTYPE" == "qsub" ]] ; then
10
+ #elif [[ "$RUNTYPE" == "msub" ]] ; then
11
+ #fi
12
+