miga-base 0.7.26.3 → 1.0.0.sr1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
- data/lib/miga/cli/action/doctor.rb +50 -19
- data/lib/miga/cli/action/doctor/base.rb +20 -18
- data/lib/miga/cli/action/init.rb +11 -7
- data/lib/miga/cli/action/init/files_helper.rb +1 -0
- data/lib/miga/cli/action/ncbi_get.rb +3 -3
- data/lib/miga/cli/action/tax_dist.rb +2 -2
- data/lib/miga/cli/action/wf.rb +5 -4
- data/lib/miga/daemon.rb +11 -4
- data/lib/miga/dataset/result.rb +10 -6
- data/lib/miga/json.rb +1 -2
- data/lib/miga/metadata.rb +5 -1
- data/lib/miga/parallel.rb +11 -6
- data/lib/miga/project.rb +8 -8
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -2
- data/lib/miga/sqlite.rb +7 -0
- data/lib/miga/version.rb +23 -9
- data/scripts/aai_distances.bash +16 -18
- data/scripts/ani_distances.bash +16 -17
- data/scripts/assembly.bash +31 -16
- data/scripts/haai_distances.bash +3 -27
- data/scripts/miga.bash +6 -4
- data/scripts/p.bash +1 -1
- data/scripts/read_quality.bash +9 -18
- data/scripts/trimmed_fasta.bash +14 -30
- data/scripts/trimmed_reads.bash +36 -36
- data/test/parallel_test.rb +31 -0
- data/test/project_test.rb +2 -1
- data/utils/distance/commands.rb +1 -0
- data/utils/distance/runner.rb +2 -4
- data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
- data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
- data/utils/enveomics/Manifest/Tasks/other.json +77 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
- data/utils/enveomics/Manifest/categories.json +13 -4
- data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
- data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
- data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
- data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
- data/utils/enveomics/Scripts/SRA.download.bash +6 -8
- data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
- data/utils/enveomics/Scripts/aai.rb +3 -2
- data/utils/enveomics/Scripts/anir.rb +137 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
- data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
- data/utils/enveomics/Scripts/rbm.rb +87 -133
- data/utils/enveomics/Scripts/sam.filter.rb +148 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
- data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
- data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
- data/utils/enveomics/enveomics.R/R/utils.R +30 -0
- data/utils/enveomics/enveomics.R/README.md +1 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +67 -0
- data/utils/multitrim/multitrim.py +1555 -0
- data/utils/multitrim/multitrim.yml +13 -0
- data/utils/requirements.txt +4 -3
- metadata +33 -6
- data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
data/lib/miga/project/result.rb
CHANGED
@@ -55,12 +55,12 @@ module MiGA::Project::Result
|
|
55
55
|
##
|
56
56
|
# Add result of any type +:*_distances+ at +base+ (no +_opts+ supported).
|
57
57
|
def add_result_distances(base, _opts)
|
58
|
-
return nil unless result_files_exist?(base, %w[.Rdata .
|
58
|
+
return nil unless result_files_exist?(base, %w[.Rdata .txt])
|
59
59
|
|
60
60
|
r = MiGA::Result.new("#{base}.json")
|
61
61
|
r.add_file(:rdata, 'miga-project.Rdata')
|
62
62
|
r.add_file(:matrix, 'miga-project.txt')
|
63
|
-
r.add_file(:log, 'miga-project.log')
|
63
|
+
r.add_file(:log, 'miga-project.log') # Legacy file
|
64
64
|
r.add_file(:hist, 'miga-project.hist')
|
65
65
|
r
|
66
66
|
end
|
data/lib/miga/sqlite.rb
CHANGED
@@ -37,6 +37,7 @@ class MiGA::SQLite < MiGA::MiGA
|
|
37
37
|
# Executes +cmd+ and returns the result
|
38
38
|
def run(*cmd)
|
39
39
|
busy_attempts ||= 0
|
40
|
+
io_attempts ||= 0
|
40
41
|
y = nil
|
41
42
|
SQLite3::Database.new(path) { |conn| y = conn.execute(*cmd) }
|
42
43
|
y
|
@@ -44,6 +45,12 @@ class MiGA::SQLite < MiGA::MiGA
|
|
44
45
|
busy_attempts += 1
|
45
46
|
raise "Database busy #{path}: #{e.message}" if busy_attempts >= 3
|
46
47
|
|
48
|
+
sleep(1)
|
49
|
+
retry
|
50
|
+
rescue SQLite3::IOException => e
|
51
|
+
io_attempts += 1
|
52
|
+
raise "Database I/O error #{path}: #{e.message}" if io_attempts >= 3
|
53
|
+
|
47
54
|
sleep(1)
|
48
55
|
retry
|
49
56
|
end
|
data/lib/miga/version.rb
CHANGED
@@ -9,23 +9,33 @@ module MiGA
|
|
9
9
|
# Current version of MiGA. An Array with three values:
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
|
-
# -
|
13
|
-
|
12
|
+
# - String indicating release status:
|
13
|
+
# - rc* release candidate, not released as gem
|
14
|
+
# - sr* stable release, released as gem
|
15
|
+
VERSION = [1.0, 0, 'sr1'].freeze
|
14
16
|
|
15
17
|
##
|
16
18
|
# Nickname for the current major.minor version.
|
17
|
-
VERSION_NAME = '
|
19
|
+
VERSION_NAME = 'prima'
|
18
20
|
|
19
21
|
##
|
20
22
|
# Date of the current gem release.
|
21
|
-
VERSION_DATE = Date.new(2021,
|
23
|
+
VERSION_DATE = Date.new(2021, 4, 12)
|
22
24
|
|
23
25
|
##
|
24
|
-
#
|
25
|
-
CITATION =
|
26
|
-
|
27
|
-
|
28
|
-
|
26
|
+
# References of MiGA
|
27
|
+
CITATION = []
|
28
|
+
CITATION << <<~REF
|
29
|
+
Rodriguez-R et al (2018). The Microbial Genomes Atlas (MiGA) webserver:
|
30
|
+
taxonomic and gene diversity analysis of Archaea and Bacteria at the whole
|
31
|
+
genome level. Nucleic Acids Research 46(W1):W282-W288.
|
32
|
+
doi:10.1093/nar/gky467.
|
33
|
+
REF
|
34
|
+
CITATION << <<~REF
|
35
|
+
Rodriguez-R et al (2020). Classifying prokaryotic genomes using the
|
36
|
+
Microbial Genomes Atlas (MiGA) webserver. Bergey's Manual of Systematics
|
37
|
+
of Archaea and Bacteria.
|
38
|
+
REF
|
29
39
|
end
|
30
40
|
|
31
41
|
class MiGA::MiGA
|
@@ -58,6 +68,10 @@ class MiGA::MiGA
|
|
58
68
|
##
|
59
69
|
# References of MiGA, as a single String with one "- " prefixed entry each
|
60
70
|
def self.CITATION
|
71
|
+
CITATION.map { |i| "- #{i}" }.join
|
72
|
+
end
|
73
|
+
|
74
|
+
def self.CITATION_ARRAY
|
61
75
|
CITATION
|
62
76
|
end
|
63
77
|
end
|
data/scripts/aai_distances.bash
CHANGED
@@ -9,34 +9,32 @@ DIR="$PROJECT/data/09.distances/02.aai"
|
|
9
9
|
# Initialize
|
10
10
|
miga_start_project_step "$DIR"
|
11
11
|
|
12
|
-
echo -n "" > miga-project.log
|
13
|
-
DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
|
14
|
-
|
15
12
|
# Extract values
|
16
13
|
rm -f miga-project.txt
|
14
|
+
SQL="SELECT seq1, seq2, aai, sd, n, omega from aai;"
|
15
|
+
DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
|
17
16
|
(
|
18
|
-
echo "
|
17
|
+
echo "a b value sd n omega" | tr " " "\\t"
|
19
18
|
for i in $DS ; do
|
20
|
-
echo "
|
21
|
-
" seq1, seq2, aai, sd, n, omega from aai;" \
|
22
|
-
| sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
|
23
|
-
echo "$i" >> miga-project.log
|
19
|
+
echo "$SQL" | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
|
24
20
|
done
|
25
21
|
) | gzip -9c > miga-project.txt.gz
|
26
22
|
|
27
23
|
# R-ify
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
24
|
+
cat <<R | R --vanilla
|
25
|
+
file <- gzfile('miga-project.txt.gz')
|
26
|
+
aai <- read.table(file, sep = '\t', header = TRUE, as.is = TRUE)
|
27
|
+
save(aai, file = 'miga-project.Rdata')
|
28
|
+
if(sum(aai[, 'a'] != aai[, 'b']) > 0) {
|
29
|
+
h <- hist(aai[aai[, 'a'] != aai[, 'b'], 'value'], breaks = 100, plot = FALSE)
|
30
|
+
len <- length(h[['breaks']])
|
33
31
|
write.table(
|
34
|
-
cbind(h[['breaks']][-
|
35
|
-
|
36
|
-
|
37
|
-
|
32
|
+
cbind(h[['breaks']][-len], h[['breaks']][-1], h[['counts']]),
|
33
|
+
file = 'miga-project.hist', quote = FALSE, sep = '\t',
|
34
|
+
col.names = FALSE, row.names = FALSE
|
35
|
+
)
|
38
36
|
}
|
39
|
-
|
37
|
+
R
|
40
38
|
|
41
39
|
# Finalize
|
42
40
|
miga_end_project_step "$DIR"
|
data/scripts/ani_distances.bash
CHANGED
@@ -9,33 +9,32 @@ DIR="$PROJECT/data/09.distances/03.ani"
|
|
9
9
|
# Initialize
|
10
10
|
miga_start_project_step "$DIR"
|
11
11
|
|
12
|
-
echo -n "" > miga-project.log
|
13
|
-
DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
|
14
|
-
|
15
12
|
# Extract values
|
16
13
|
rm -f miga-project.txt
|
14
|
+
SQL="SELECT seq1, seq2, ani, sd, n, omega from ani;"
|
15
|
+
DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
|
17
16
|
(
|
18
|
-
echo "
|
17
|
+
echo "a b value sd n omega" | tr " " "\\t"
|
19
18
|
for i in $DS ; do
|
20
|
-
echo "
|
21
|
-
| sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
|
22
|
-
echo "$i" >> miga-project.log
|
19
|
+
echo "$SQL" | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
|
23
20
|
done
|
24
21
|
) | gzip -9c > miga-project.txt.gz
|
25
22
|
|
26
23
|
# R-ify
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
24
|
+
cat <<R | R --vanilla
|
25
|
+
file <- gzfile('miga-project.txt.gz')
|
26
|
+
ani <- read.table(file, sep = '\t', header = TRUE, as.is = TRUE)
|
27
|
+
save(ani, file = 'miga-project.Rdata')
|
28
|
+
if(sum(ani[, 'a'] != ani[, 'b']) > 0) {
|
29
|
+
h <- hist(ani[ani[, 'a'] != ani[, 'b'], 'value'], breaks = 100, plot = FALSE)
|
30
|
+
len <- length(h[['breaks']])
|
32
31
|
write.table(
|
33
|
-
cbind(h[['breaks']][-
|
34
|
-
|
35
|
-
|
36
|
-
|
32
|
+
cbind(h[['breaks']][-len], h[['breaks']][-1], h[['counts']]),
|
33
|
+
file = 'miga-project.hist', quote = FALSE, sep = '\t',
|
34
|
+
col.names = FALSE, row.names = FALSE
|
35
|
+
)
|
37
36
|
}
|
38
|
-
|
37
|
+
R
|
39
38
|
|
40
39
|
# Finalize
|
41
40
|
miga_end_project_step "$DIR"
|
data/scripts/assembly.bash
CHANGED
@@ -11,30 +11,44 @@ miga date > "$DATASET.start"
|
|
11
11
|
|
12
12
|
# Interpose (if needed)
|
13
13
|
TF="../04.trimmed_fasta"
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
14
|
+
b=$DATASET
|
15
|
+
if [[ -s "$TF/${b}.2.fasta" || -s "$TF/${b}.2.fasta.gz" ]] ; then
|
16
|
+
cr="$TF/${b}.CoupledReads.fa"
|
17
|
+
if [[ ! -s "$cr" && ! -s "${cr}.gz" ]] ; then
|
18
|
+
for s in 1 2 ; do
|
19
|
+
if [[ -s "$TF/${b}.${s}.fasta" ]] ; then
|
20
|
+
ln -s "$TF/${b}.${s}.fasta" "${b}.${s}.tmp"
|
21
|
+
else
|
22
|
+
gzip -cd "$TF/${b}.${s}.fasta.gz" > "${b}.${s}.tmp"
|
23
|
+
fi
|
24
|
+
done
|
25
|
+
FastA.interpose.pl "$cr" "$b".[12].tmp
|
26
|
+
rm "$b".[12].tmp
|
27
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_fasta -f
|
28
|
+
fi
|
21
29
|
fi
|
22
30
|
|
31
|
+
# Gzip (if needed)
|
32
|
+
for i in SingleReads CoupledReads ; do
|
33
|
+
base="$TF/${DATASET}.${i}.fa"
|
34
|
+
if [[ -e "$base" && ! -s "${base}.gz" ]] ; then
|
35
|
+
gzip -9f "$base"
|
36
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_fasta -f
|
37
|
+
fi
|
38
|
+
done
|
39
|
+
|
23
40
|
# Assemble
|
24
|
-
FA="$TF/$DATASET.CoupledReads.fa"
|
25
|
-
[[ -e "$FA" ]] || FA="$
|
26
|
-
[[ -e "$FA" ]] || FA="../04.trimmed_fasta/$DATASET.SingleReads.fa"
|
27
|
-
[[ -e "$FA" ]] || FA="$FA.gz"
|
41
|
+
FA="$TF/${DATASET}.CoupledReads.fa.gz"
|
42
|
+
[[ -e "$FA" ]] || FA="$TF/${DATASET}.SingleReads.fa.gz"
|
28
43
|
RD="r"
|
29
44
|
[[ $FA == *.SingleReads.fa* ]] && RD="l"
|
30
|
-
|
45
|
+
gzip -cd "$FA" \
|
46
|
+
| idba_ud --pre_correction -$RD /dev/stdin \
|
47
|
+
-o "$DATASET" --num_threads "$CORES" || true
|
31
48
|
[[ -s "$DATASET/contig.fa" ]] || exit 1
|
32
49
|
|
33
50
|
# Clean
|
34
|
-
(
|
35
|
-
cd "$DATASET"
|
36
|
-
rm kmer graph-*.fa align-* local-contig-*.fa contig-*.fa
|
37
|
-
)
|
51
|
+
( cd "$DATASET" && rm kmer graph-*.fa align-* local-contig-*.fa contig-*.fa )
|
38
52
|
|
39
53
|
# Extract
|
40
54
|
if [[ -s "$DATASET/scaffold.fa" ]] ; then
|
@@ -49,3 +63,4 @@ FastA.length.pl "$DATASET.AllContigs.fna" | awk '$2>=1000{print $1}' \
|
|
49
63
|
# Finalize
|
50
64
|
miga date > "$DATASET.done"
|
51
65
|
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
|
66
|
+
|
data/scripts/haai_distances.bash
CHANGED
@@ -12,34 +12,10 @@ miga_start_project_step "$DIR"
|
|
12
12
|
# Cleanup databases
|
13
13
|
ruby -I "$MIGA/lib" "$MIGA/utils/cleanup-databases.rb" "$PROJECT" "$CORES"
|
14
14
|
|
15
|
-
#
|
15
|
+
# hAAI distributions are not needed: only write empty placeholder outputs
|
16
16
|
echo -n "" > miga-project.log
|
17
|
-
|
18
|
-
|
19
|
-
# Extract values
|
20
|
-
rm -f miga-project.txt
|
21
|
-
(
|
22
|
-
echo "metric a b value sd n omega" | tr " " "\\t"
|
23
|
-
for i in $DS ; do
|
24
|
-
echo "SELECT 'hAAI', seq1, seq2, aai, sd, n, omega from aai ;" \
|
25
|
-
| sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
|
26
|
-
echo "$i" >> miga-project.log
|
27
|
-
done
|
28
|
-
) | gzip -9c > miga-project.txt.gz
|
29
|
-
|
30
|
-
# R-ify
|
31
|
-
echo "
|
32
|
-
haai <- read.table(gzfile('miga-project.txt.gz'), sep='\\t', h=T, as.is=TRUE);
|
33
|
-
save(haai, file='miga-project.Rdata');
|
34
|
-
if(sum(haai[,'a'] != haai[,'b']) > 0){
|
35
|
-
h <- hist(haai[haai[,'a'] != haai[,'b'], 'value'], breaks=100, plot=FALSE);
|
36
|
-
write.table(
|
37
|
-
cbind(h[['breaks']][-length(h[['breaks']])],
|
38
|
-
h[['breaks']][-1], h[['counts']]),
|
39
|
-
file='miga-project.hist', quote=FALSE, sep='\\t',
|
40
|
-
col.names=FALSE, row.names=FALSE);
|
41
|
-
}
|
42
|
-
" | R --vanilla
|
17
|
+
echo -n "" > miga-project.txt
|
18
|
+
echo "aai <- NULL; save(aai, file = 'miga-project.Rdata')" | R --vanilla
|
43
19
|
|
44
20
|
# Finalize
|
45
21
|
miga_end_project_step "$DIR"
|
data/scripts/miga.bash
CHANGED
@@ -7,9 +7,11 @@ SCRIPT=${SCRIPT:-$(basename "$0" .bash)}
|
|
7
7
|
# shellcheck source=/dev/null
|
8
8
|
. "$MIGA_HOME/.miga_rc"
|
9
9
|
|
10
|
-
# Ensure submodules are first in PATH
|
11
|
-
export PATH="$MIGA/bin:$
|
12
|
-
|
10
|
+
# Ensure MiGA & submodules are first in PATH
|
11
|
+
export PATH="$MIGA/bin:$PATH"
|
12
|
+
for util in enveomics/Scripts FastAAI/FastAAI multitrim ; do
|
13
|
+
export PATH="$MIGA/utils/$util:$PATH"
|
14
|
+
done
|
13
15
|
|
14
16
|
# Ancillary functions
|
15
17
|
function exists { [[ -e "$1" ]] ; }
|
@@ -38,7 +40,7 @@ if [[ "$SCRIPT" != "d" && "$SCRIPT" != "p" ]] ; then
|
|
38
40
|
echo ""
|
39
41
|
echo "######[ $SCRIPT ]######"
|
40
42
|
echo "# Date: $(miga date)"
|
41
|
-
echo "# Host: $(hostname)"
|
43
|
+
echo "# Host: $(hostname) [$CORES]"
|
42
44
|
echo "# MiGA: $MIGA"
|
43
45
|
echo "# Project: $PROJECT"
|
44
46
|
if [[ -n $DATASET ]] ; then
|
data/scripts/p.bash
CHANGED
data/scripts/read_quality.bash
CHANGED
@@ -6,28 +6,19 @@ SCRIPT="read_quality"
|
|
6
6
|
. "$MIGA/scripts/miga.bash" || exit 1
|
7
7
|
cd "$PROJECT/data/03.read_quality"
|
8
8
|
|
9
|
-
b=$DATASET
|
10
|
-
|
11
9
|
# Initialize
|
12
10
|
miga date > "$DATASET.start"
|
13
11
|
|
14
|
-
#
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
# Clean 02.trimmed_reads
|
24
|
-
rm -f "../02.trimmed_reads/$b".[12].fastq_trimmed.segments
|
25
|
-
rm -f "../02.trimmed_reads/$b".[12].fastq.trimmed.paired
|
26
|
-
rm -f "../02.trimmed_reads/$b".[12].fastq.trimmed.single
|
27
|
-
rm -f "../02.trimmed_reads/$b".[12].fastq.trimmed
|
28
|
-
rm -f "../02.trimmed_reads/$b".[12].fastq
|
29
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
|
12
|
+
# Gzip (if necessary)
|
13
|
+
for s in 1 2 ; do
|
14
|
+
in="../02.trimmed_reads/${DATASET}.${s}.clipped.fastq"
|
15
|
+
if [[ -s "$in" ]] ; then
|
16
|
+
gzip -9f "$in"
|
17
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
|
18
|
+
fi
|
19
|
+
done
|
30
20
|
|
31
21
|
# Finalize
|
32
22
|
miga date > "$DATASET.done"
|
33
23
|
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
|
24
|
+
|
data/scripts/trimmed_fasta.bash
CHANGED
@@ -11,43 +11,27 @@ b=$DATASET
|
|
11
11
|
# Initialize
|
12
12
|
miga date > "$DATASET.start"
|
13
13
|
|
14
|
-
#
|
15
|
-
for
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
&& gzip -d "../02.trimmed_reads/$b.$sis.${ext}.fastq.gz"
|
20
|
-
done
|
14
|
+
# FastQ -> FastA
|
15
|
+
for s in 1 2 ; do
|
16
|
+
in="../02.trimmed_reads/${b}.${s}.clipped.fastq.gz"
|
17
|
+
[[ -s "$in" ]] \
|
18
|
+
&& FastQ.maskQual.rb -i "$in" -o "${b}.${s}.fasta" --fasta --qual 18 # one FastA per mate (was always .1.fasta)
|
21
19
|
done
|
22
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
|
23
20
|
|
24
|
-
#
|
25
|
-
|
26
|
-
|
27
|
-
if [[ -e "../02.trimmed_reads/$b.2.clipped.fastq" ]] ; then
|
28
|
-
awk -f "$FQ2A" < "../02.trimmed_reads/$b.2.clipped.fastq" > "$b.2.fasta"
|
29
|
-
FastA.interpose.pl "$b.CoupledReads.fa" "$b".[12].fasta
|
30
|
-
gzip -9 -f "$b.2.fasta"
|
31
|
-
gzip -9 -f "$b.1.fasta"
|
32
|
-
awk -f "$FQ2A" < "../02.trimmed_reads/$b".[12].clipped.single.fastq \
|
33
|
-
> "$b.SingleReads.fa"
|
34
|
-
gzip -9 -f "$b.SingleReads.fa"
|
21
|
+
# Interpose
|
22
|
+
if [[ -e "${b}.2.fasta" ]] ; then
|
23
|
+
FastA.interpose.pl "${b}.CoupledReads.fa" "$b".[12].fasta
|
35
24
|
else
|
36
|
-
mv "$b.1.fasta" "$b.SingleReads.fa"
|
25
|
+
mv "${b}.1.fasta" "${b}.SingleReads.fa"
|
37
26
|
fi
|
38
27
|
|
39
|
-
#
|
40
|
-
for
|
41
|
-
|
42
|
-
|
43
|
-
[[ -e "../02.trimmed_reads/$b.$sis.clipped.fastq" ]] \
|
44
|
-
&& gzip -9 -f "../02.trimmed_reads/$b.$sis.clipped.fastq"
|
45
|
-
[[ -e "../02.trimmed_reads/$b.$sis.clipped.single.fastq" ]] \
|
46
|
-
&& gzip -9 -f "../02.trimmed_reads/$b.$sis.clipped.single.fastq"
|
28
|
+
# Gzip
|
29
|
+
for x in 1.fasta 2.fasta SingleReads.fa CoupledReads.fa ; do
|
30
|
+
in="${b}.${x}"
|
31
|
+
[[ -e "$in" ]] && gzip -9f "$in"
|
47
32
|
done
|
48
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r raw_reads -f
|
49
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
|
50
33
|
|
51
34
|
# Finalize
|
52
35
|
miga date > "$DATASET.done"
|
53
36
|
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
|
37
|
+
|
data/scripts/trimmed_reads.bash
CHANGED
@@ -11,49 +11,49 @@ b=$DATASET
|
|
11
11
|
# Initialize
|
12
12
|
miga date > "$DATASET.start"
|
13
13
|
|
14
|
-
# Unzip (if necessary)
|
15
|
-
[[ -e "../01.raw_reads/$b.1.fastq.gz" && ! -e "../01.raw_reads/$b.1.fastq" ]] \
|
16
|
-
&& gunzip "../01.raw_reads/$b.1.fastq.gz"
|
17
|
-
[[ -e "../01.raw_reads/$b.2.fastq.gz" && ! -e "../01.raw_reads/$b.2.fastq" ]] \
|
18
|
-
&& gunzip "../01.raw_reads/$b.2.fastq.gz"
|
19
|
-
miga add_result -P "$PROJECT" -D "$DATASET" -r raw_reads -f
|
20
|
-
|
21
14
|
# Clean existing files
|
22
15
|
exists "$b".[12].* && rm "$b".[12].*
|
23
16
|
|
17
|
+
# Gzip (if necessary)
|
18
|
+
for s in 1 2 ; do
|
19
|
+
in="../01.raw_reads/${b}.${s}.fastq"
|
20
|
+
if [[ -s "$in" ]] ; then
|
21
|
+
gzip -9f "$in"
|
22
|
+
miga add_result -P "$PROJECT" -D "$DATASET" -r raw_reads -f
|
23
|
+
fi
|
24
|
+
done
|
25
|
+
|
24
26
|
# Tag
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
#
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
SolexaQA++ lengthsort "$b".[12].clipped.all.fastq -l 50 -d .
|
41
|
-
rm "$b".[12].clipped.all.fastq
|
42
|
-
[[ -e "$b".1.clipped.all.fastq.single ]] \
|
43
|
-
&& mv "$b.1.clipped.all.fastq.single" "$b.1.clipped.single.fastq"
|
44
|
-
[[ -e "$b".2.clipped.all.fastq.single ]] \
|
45
|
-
&& mv "$b.2.clipped.all.fastq.single" "$b.2.clipped.single.fastq"
|
46
|
-
mv "$b.1.clipped.all.fastq.paired" "$b.1.clipped.fastq"
|
47
|
-
mv "$b.2.clipped.all.fastq.paired" "$b.2.clipped.fastq"
|
48
|
-
rm -f "$b.1.clipped.all.fastq.summary.txt"
|
27
|
+
in1="../01.raw_reads/$b.1.fastq.gz"
|
28
|
+
in2="../01.raw_reads/$b.2.fastq.gz"
|
29
|
+
FastQ.tag.rb -i "$in1" -p "$b-" -s "/1" -o "$b.1.fastq.gz"
|
30
|
+
[[ -e "$in2" ]] && FastQ.tag.rb -i "$in2" -p "$b-" -s "/2" -o "$b.2.fastq.gz"
|
31
|
+
|
32
|
+
# Multitrim
|
33
|
+
CMD="multitrim.py --zip gzip --level 9 --threads $CORES -o $b"
|
34
|
+
if [[ -s "$b.2.fastq.gz" ]] ; then
|
35
|
+
# Paired
|
36
|
+
$CMD -1 "$b.1.fastq.gz" -2 "$b.2.fastq.gz"
|
37
|
+
for s in 1 2 ; do
|
38
|
+
mv "$b/${s}.post_trim_${b}.${s}.fq.gz" "${b}.${s}.clipped.fastq.gz"
|
39
|
+
mv "$b/${s}.pre_trim_QC_${b}.${s}.html" "../03.read_quality/${b}.pre.${s}.html"
|
40
|
+
mv "$b/${s}.post_trim_QC_${b}.${s}.html" "../03.read_quality/${b}.post.${s}.html"
|
41
|
+
done
|
49
42
|
else
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
mv "$b.1.
|
43
|
+
# Unpaired
|
44
|
+
$CMD -u "$b.1.fastq.gz"
|
45
|
+
mv "$b/unpaired.post_trim_${b}.1.fq.gz" "${b}.1.clipped.fastq.gz"
|
46
|
+
mv "$b/unpaired.pre_trim_QC_${b}.1.html" "../03.read_quality/${b}.pre.1.html"
|
47
|
+
mv "$b/unpaired.post_trim_QC_${b}.1.html" "../03.read_quality/${b}.post.1.html"
|
54
48
|
fi
|
55
|
-
|
49
|
+
mv "$b/Subsample_Adapter_Detection.stats.txt" \
|
50
|
+
"../03.read_quality/$b.adapters.txt"
|
51
|
+
|
52
|
+
# Cleanup
|
53
|
+
rm -r "$b"
|
54
|
+
rm -f "$b".[12].fastq.gz
|
56
55
|
|
57
56
|
# Finalize
|
58
57
|
miga date > "$DATASET.done"
|
59
58
|
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
|
59
|
+
|