miga-base 1.2.12.1 → 1.2.13.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/common/format.rb +3 -0
- data/lib/miga/dataset/base.rb +4 -4
- data/lib/miga/dataset/result.rb +5 -6
- data/lib/miga/result/stats.rb +38 -4
- data/lib/miga/version.rb +2 -2
- data/scripts/ssu.bash +35 -15
- data/scripts/stats.bash +1 -26
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c997637653ab5c7174be88b049f9ec78c432469a0bf08dc8c3572563480733b8
|
4
|
+
data.tar.gz: 731b7cad9da3266f2d21161dd5669d20214a648cded2a9cb9db3b0f23881a3fb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: af2b18c2d3153ee02a8a7aa6f79efda6f451678e63353e9a9e02b9248915d63536ab46aabe5d9e55d942da51d38d66df77c67aa25e15e2df604cf1e6af075dc3
|
7
|
+
data.tar.gz: 3cce4b8c2cce42a87d31e93c14c7c838ad40a90b5daa3c4220070cada7c0b092dfcc7c76e6c3744dcb9d144d9570e552046a6fec82cadfced72857814be27c8a
|
data/lib/miga/common/format.rb
CHANGED
@@ -155,6 +155,9 @@ class String
|
|
155
155
|
.gsub(/g_c_(skew)/, 'G-C \\1')
|
156
156
|
.gsub(/a_t_(skew)/, 'A-T \\1')
|
157
157
|
.gsub(/x_content/, &:capitalize)
|
158
|
+
.gsub(/(^|_)([sl]su|a[an]i)(_|$)/, &:upcase)
|
159
|
+
.gsub(/^trna_/, 'tRNA ')
|
160
|
+
.gsub(/tRNA aa/, 'tRNA AA')
|
158
161
|
.tr('_', ' ')
|
159
162
|
end
|
160
163
|
|
data/lib/miga/dataset/base.rb
CHANGED
@@ -38,13 +38,13 @@ module MiGA::Dataset::Base
|
|
38
38
|
cds: '06.cds',
|
39
39
|
# Annotation
|
40
40
|
essential_genes: '07.annotation/01.function/01.essential',
|
41
|
-
ssu: '07.annotation/01.function/02.ssu',
|
42
41
|
mytaxa: '07.annotation/02.taxonomy/01.mytaxa',
|
43
42
|
mytaxa_scan: '07.annotation/03.qa/02.mytaxa_scan',
|
44
43
|
# Distances (for single-species datasets)
|
45
44
|
taxonomy: '09.distances/05.taxonomy',
|
46
45
|
distances: '09.distances',
|
47
|
-
#
|
46
|
+
# Post-QC
|
47
|
+
ssu: '07.annotation/01.function/02.ssu',
|
48
48
|
stats: '90.stats'
|
49
49
|
}
|
50
50
|
|
@@ -72,8 +72,8 @@ module MiGA::Dataset::Base
|
|
72
72
|
# Returns an Array of tasks to be executed before project-wide tasks
|
73
73
|
@@PREPROCESSING_TASKS = [
|
74
74
|
:raw_reads, :trimmed_reads, :read_quality, :trimmed_fasta,
|
75
|
-
:assembly, :cds, :essential_genes, :
|
76
|
-
:taxonomy, :distances, :stats
|
75
|
+
:assembly, :cds, :essential_genes, :mytaxa, :mytaxa_scan,
|
76
|
+
:taxonomy, :distances, :ssu, :stats
|
77
77
|
]
|
78
78
|
|
79
79
|
##
|
data/lib/miga/dataset/result.rb
CHANGED
@@ -295,9 +295,11 @@ module MiGA::Dataset::Result
|
|
295
295
|
r = add_files_to_ds_result(
|
296
296
|
MiGA::Result.new("#{base}.json"), name,
|
297
297
|
longest_ssu_gene: '.ssu.fa',
|
298
|
-
|
298
|
+
ssu_gff: '.ssu.gff', # DEPRECATED
|
299
|
+
gff: '.gff',
|
299
300
|
all_ssu_genes: '.ssu.all.fa',
|
300
|
-
classification: '.rdp.tsv'
|
301
|
+
classification: '.rdp.tsv',
|
302
|
+
trna_list: '.trna.txt'
|
301
303
|
)
|
302
304
|
opts[:is_clean] ||= false
|
303
305
|
r.clean! if opts[:is_clean]
|
@@ -384,10 +386,7 @@ module MiGA::Dataset::Result
|
|
384
386
|
##
|
385
387
|
# Add result type +:stats+ at +base+ (no +_opts+ supported)
|
386
388
|
def add_result_stats(base, _opts)
|
387
|
-
|
388
|
-
MiGA::Result.new("#{base}.json"), name,
|
389
|
-
trna_list: '.trna.txt'
|
390
|
-
)
|
389
|
+
MiGA::Result.new("#{base}.json")
|
391
390
|
end
|
392
391
|
|
393
392
|
##
|
data/lib/miga/result/stats.rb
CHANGED
@@ -159,18 +159,52 @@ module MiGA::Result::Stats
|
|
159
159
|
end
|
160
160
|
|
161
161
|
def compute_stats_ssu
|
162
|
-
stats = {
|
162
|
+
stats = {
|
163
|
+
ssu: 0, complete_ssu: 0, ssu_fragment: 0.0,
|
164
|
+
lsu: 0, complete_lsu: 0, lsu_fragment: 0.0
|
165
|
+
}
|
166
|
+
|
163
167
|
Zlib::GzipReader.open(file_path(:gff)) do |fh|
|
164
168
|
fh.each_line do |ln|
|
165
169
|
next if ln =~ /^#/
|
166
170
|
|
167
171
|
rl = ln.chomp.split("\t")
|
168
|
-
|
169
|
-
|
172
|
+
feat = Hash[rl[8].split(';').map { |i| i.split('=', 2) }]
|
173
|
+
subunit = feat['Name'] == '16S_rRNA' ? :ssu : :lsu
|
174
|
+
if subunit == :ssu
|
175
|
+
len = (rl[4].to_i - rl[3].to_i).abs + 1
|
176
|
+
stats[:max_length] = [stats[:max_length] || 0, len].max
|
177
|
+
end
|
170
178
|
stats[:ssu] += 1
|
171
|
-
|
179
|
+
if feat['product'] =~ /\(partial\)/
|
180
|
+
if feat['note'] =~ /aligned only (\d+) percent/
|
181
|
+
fragment = $1.to_f
|
182
|
+
stats[:"#{subunit}_fragment"] ||= [fragment, '%']
|
183
|
+
if fragment > stats[:"#{subunit}_fragment"][0]
|
184
|
+
stats[:"#{subunit}_fragment"][0] = fragment
|
185
|
+
end
|
186
|
+
end
|
187
|
+
else
|
188
|
+
stats[:"complete_#{subunit}"] += 1
|
189
|
+
stats[:"#{subunit}_fragment"] = [100.0, '%']
|
190
|
+
end
|
172
191
|
end
|
173
192
|
end
|
193
|
+
|
194
|
+
Zlib::GzipReader.open(file_path(:trna_list)) do |fh|
|
195
|
+
no = 0
|
196
|
+
stats[:trna_count] = 0
|
197
|
+
aa = {}
|
198
|
+
fh.each_line do |ln|
|
199
|
+
next if (no += 1) < 4
|
200
|
+
stats[:trna_count] += 1
|
201
|
+
row = ln.chomp.split("\t")
|
202
|
+
next if row[9] == 'pseudo' || row[4] == 'Undet'
|
203
|
+
aa[row[4].gsub(/^f?([A-Za-z]+)[0-9]?/, '\1')] = true
|
204
|
+
end
|
205
|
+
stats[:trna_aa] = aa.size
|
206
|
+
end if file_path(:trna_list)
|
207
|
+
|
174
208
|
stats
|
175
209
|
end
|
176
210
|
|
data/lib/miga/version.rb
CHANGED
@@ -12,7 +12,7 @@ module MiGA
|
|
12
12
|
# - String indicating release status:
|
13
13
|
# - rc* release candidate, not released as gem
|
14
14
|
# - [0-9]+ stable release, released as gem
|
15
|
-
VERSION = [1.2,
|
15
|
+
VERSION = [1.2, 13, 0].freeze
|
16
16
|
|
17
17
|
##
|
18
18
|
# Nickname for the current major.minor version.
|
@@ -20,7 +20,7 @@ module MiGA
|
|
20
20
|
|
21
21
|
##
|
22
22
|
# Date of the current gem release.
|
23
|
-
VERSION_DATE = Date.new(2022, 12,
|
23
|
+
VERSION_DATE = Date.new(2022, 12, 31)
|
24
24
|
|
25
25
|
##
|
26
26
|
# References of MiGA
|
data/scripts/ssu.bash
CHANGED
@@ -13,36 +13,56 @@ miga date > "$DATASET.start"
|
|
13
13
|
|
14
14
|
fa="../../../05.assembly/$DATASET.LargeContigs.fna"
|
15
15
|
if [[ -s $fa ]] ; then
|
16
|
+
# Get domain
|
17
|
+
d="$(miga ls -P "$PROJECT" -D "$DATASET" -m tax:d | awk '{print $2}')"
|
18
|
+
if [[ "$d" != "Bacteria" && "$d" != "Archaea" && "$d" != "Eukaryota" ]] ; then
|
19
|
+
d="Bacteria" # Assume Bacteria in the absence of additional information
|
20
|
+
fi
|
21
|
+
|
16
22
|
# Run barrnap
|
17
|
-
|
18
|
-
|
23
|
+
dom_opt="$(echo "$d" | perl -ne 'print lc' | head -c 3)"
|
24
|
+
barrnap --quiet --kingdom "$dom_opt" --threads "$CORES" "$fa" \
|
25
|
+
> "${DATASET}.gff"
|
19
26
|
|
20
27
|
# Extract
|
21
|
-
|
22
|
-
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
+
grep "^##gff\\|;product=16S " < "${DATASET}.gff" \
|
29
|
+
| bedtools getfasta -s "-fi" "$fa" -bed /dev/stdin \
|
30
|
+
-fo "${DATASET}.ssu.all.fa"
|
31
|
+
FastA.length.pl "${DATASET}.ssu.all.fa" | sort -nr -k 2 | head -n 1 \
|
32
|
+
| cut -f 1 > "${DATASET}.ssu.fa.id"
|
33
|
+
FastA.filter.pl "${DATASET}.ssu.fa.id" "${DATASET}.ssu.all.fa" \
|
34
|
+
> "${DATASET}.ssu.fa"
|
35
|
+
rm "${DATASET}.ssu.fa.id"
|
36
|
+
[[ -e "${fa}.fai" ]] && rm "${fa}.fai"
|
28
37
|
|
29
38
|
# RDP classifier
|
30
|
-
if [[ "$MIGA_RDP" == "yes" && -s "$DATASET.ssu.all.fa" ]] ; then
|
39
|
+
if [[ "$MIGA_RDP" == "yes" && -s "${DATASET}.ssu.all.fa" ]] ; then
|
31
40
|
java -jar "$MIGA_HOME/.miga_db/classifier.jar" classify \
|
32
|
-
-c 0.8 -f fixrank -g 16srrna -o "$DATASET.rdp.tsv" \
|
33
|
-
"$DATASET.ssu.all.fa"
|
41
|
+
-c 0.8 -f fixrank -g 16srrna -o "${DATASET}.rdp.tsv" \
|
42
|
+
"${DATASET}.ssu.all.fa"
|
34
43
|
echo "# Version: $(perl -pe 's/.*://' \
|
35
44
|
< "$MIGA_HOME/.miga_db/classifier.version.txt" \
|
36
45
|
| grep . | paste - - | perl -pe 's/\t/; /')" \
|
37
|
-
>> "$DATASET.rdp.tsv"
|
46
|
+
>> "${DATASET}.rdp.tsv"
|
38
47
|
fi
|
39
48
|
|
49
|
+
# tRNAscan-SE
|
50
|
+
dom_opt="-$(echo "$d" | perl -pe 's/(\S).*/$1/')"
|
51
|
+
out="${DATASET}.trna.txt"
|
52
|
+
# `echo O` is to avoid a hang from a pre-existing output file.
|
53
|
+
# This is better than pre-checking (and removing), because it avoids
|
54
|
+
# the (unlikely) scenario of a file racing (e.g., a file created right
|
55
|
+
# before tRNAscan-SE starts, or a `rm` failure).
|
56
|
+
#
|
57
|
+
# The trailing `|| true` is to treat failure as non-fatal
|
58
|
+
echo O | tRNAscan-SE $dom_opt -o "${DATASET}.trna.txt" -q "$fa" || true
|
59
|
+
|
40
60
|
# Gzip
|
41
|
-
for x in
|
61
|
+
for x in gff ssu.all.fa rdp.tsv trna.txt ; do
|
42
62
|
[[ -e "${DATASET}.${x}" ]] && gzip -9 -f "${DATASET}.${x}"
|
43
63
|
done
|
44
64
|
fi
|
45
65
|
|
46
66
|
# Finalize
|
47
|
-
miga date > "$DATASET.done"
|
67
|
+
miga date > "${DATASET}.done"
|
48
68
|
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
|
data/scripts/stats.bash
CHANGED
@@ -11,33 +11,8 @@ cd "$DIR"
|
|
11
11
|
# Initialize
|
12
12
|
miga date > "${DATASET}.start"
|
13
13
|
|
14
|
-
# tRNAscan-SE
|
15
|
-
fa="../05.assembly/${DATASET}.LargeContigs.fna"
|
16
|
-
if [[ -s "$fa" ]] ; then
|
17
|
-
d="$(miga ls -P "$PROJECT" -D "$DATASET" -m tax:d | awk '{print $2}')"
|
18
|
-
if [[ "$d" == "Bacteria" || "$d" == "Archaea" || "$d" == "Eukaryota" ]] ; then
|
19
|
-
dom_opt="-$(echo "$d" | perl -pe 's/(\S).*/$1/')"
|
20
|
-
out="${DATASET}.trna.txt"
|
21
|
-
# `echo O` is to avoid a hang from a pre-existing output file.
|
22
|
-
# This is better than pre-checking (and removing), because it avoids
|
23
|
-
# the (unlikely) scenario of a file racing (e.g., a file created right
|
24
|
-
# before tRNAscan-SE starts, or a `rm` failure).
|
25
|
-
#
|
26
|
-
# The trailing `|| true` is to treat failure as non-fatal
|
27
|
-
echo O | tRNAscan-SE $dom_opt -o "$out" -q "$fa" || true
|
28
|
-
if [[ -s "$out" ]] ; then
|
29
|
-
cnt=$(tail -n +4 "$out" | wc -l | awk '{print $1}')
|
30
|
-
aa="$(tail -n +4 "$out" | grep -v 'pseudo$' | awk '{print $5}' \
|
31
|
-
| grep -v 'Undet' | perl -pe 's/^f?([A-Za-z]+)[0-9]?/$1/' \
|
32
|
-
| sort | uniq | wc -l | awk '{print $1}')"
|
33
|
-
miga edit -P "$PROJECT" -D "$DATASET" \
|
34
|
-
-m "trna_count=Int($cnt),trna_aa=Int($aa)"
|
35
|
-
fi
|
36
|
-
fi
|
37
|
-
fi
|
38
|
-
|
39
14
|
# Calculate statistics
|
40
|
-
for i in raw_reads trimmed_fasta assembly cds essential_genes
|
15
|
+
for i in raw_reads trimmed_fasta assembly cds essential_genes distances taxonomy ssu ; do
|
41
16
|
echo "# $i"
|
42
17
|
miga stats --compute-and-save --ignore-empty -P "$PROJECT" -D "$DATASET" -r $i
|
43
18
|
done
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.
|
4
|
+
version: 1.2.13.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-12-
|
11
|
+
date: 2022-12-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|