miga-base 1.2.12.0 → 1.2.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/cli/objects_helper.rb +3 -0
- data/lib/miga/common/format.rb +3 -0
- data/lib/miga/dataset/base.rb +4 -4
- data/lib/miga/dataset/result.rb +5 -6
- data/lib/miga/result/stats.rb +38 -4
- data/lib/miga/version.rb +2 -2
- data/scripts/ssu.bash +35 -15
- data/scripts/stats.bash +1 -26
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c997637653ab5c7174be88b049f9ec78c432469a0bf08dc8c3572563480733b8
|
4
|
+
data.tar.gz: 731b7cad9da3266f2d21161dd5669d20214a648cded2a9cb9db3b0f23881a3fb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: af2b18c2d3153ee02a8a7aa6f79efda6f451678e63353e9a9e02b9248915d63536ab46aabe5d9e55d942da51d38d66df77c67aa25e15e2df604cf1e6af075dc3
|
7
|
+
data.tar.gz: 3cce4b8c2cce42a87d31e93c14c7c838ad40a90b5daa3c4220070cada7c0b092dfcc7c76e6c3744dcb9d144d9570e552046a6fec82cadfced72857814be27c8a
|
@@ -99,6 +99,9 @@ module MiGA::Cli::ObjectsHelper
|
|
99
99
|
when 'true'; v = true
|
100
100
|
when 'false'; v = false
|
101
101
|
when 'nil'; v = nil
|
102
|
+
when /^(Int|Float)\(.*\)$/
|
103
|
+
tr = v =~ /^Int/ ? :to_i : :to_f
|
104
|
+
v = v.gsub(/.*\((.*)\)/, '\1').send(tr)
|
102
105
|
end
|
103
106
|
if k == '_step'
|
104
107
|
obj.metadata["_try_#{v}"] ||= 0
|
data/lib/miga/common/format.rb
CHANGED
@@ -155,6 +155,9 @@ class String
|
|
155
155
|
.gsub(/g_c_(skew)/, 'G-C \\1')
|
156
156
|
.gsub(/a_t_(skew)/, 'A-T \\1')
|
157
157
|
.gsub(/x_content/, &:capitalize)
|
158
|
+
.gsub(/(^|_)([sl]su|a[an]i)(_|$)/, &:upcase)
|
159
|
+
.gsub(/^trna_/, 'tRNA ')
|
160
|
+
.gsub(/tRNA aa/, 'tRNA AA')
|
158
161
|
.tr('_', ' ')
|
159
162
|
end
|
160
163
|
|
data/lib/miga/dataset/base.rb
CHANGED
@@ -38,13 +38,13 @@ module MiGA::Dataset::Base
|
|
38
38
|
cds: '06.cds',
|
39
39
|
# Annotation
|
40
40
|
essential_genes: '07.annotation/01.function/01.essential',
|
41
|
-
ssu: '07.annotation/01.function/02.ssu',
|
42
41
|
mytaxa: '07.annotation/02.taxonomy/01.mytaxa',
|
43
42
|
mytaxa_scan: '07.annotation/03.qa/02.mytaxa_scan',
|
44
43
|
# Distances (for single-species datasets)
|
45
44
|
taxonomy: '09.distances/05.taxonomy',
|
46
45
|
distances: '09.distances',
|
47
|
-
#
|
46
|
+
# Post-QC
|
47
|
+
ssu: '07.annotation/01.function/02.ssu',
|
48
48
|
stats: '90.stats'
|
49
49
|
}
|
50
50
|
|
@@ -72,8 +72,8 @@ module MiGA::Dataset::Base
|
|
72
72
|
# Returns an Array of tasks to be executed before project-wide tasks
|
73
73
|
@@PREPROCESSING_TASKS = [
|
74
74
|
:raw_reads, :trimmed_reads, :read_quality, :trimmed_fasta,
|
75
|
-
:assembly, :cds, :essential_genes, :
|
76
|
-
:taxonomy, :distances, :stats
|
75
|
+
:assembly, :cds, :essential_genes, :mytaxa, :mytaxa_scan,
|
76
|
+
:taxonomy, :distances, :ssu, :stats
|
77
77
|
]
|
78
78
|
|
79
79
|
##
|
data/lib/miga/dataset/result.rb
CHANGED
@@ -295,9 +295,11 @@ module MiGA::Dataset::Result
|
|
295
295
|
r = add_files_to_ds_result(
|
296
296
|
MiGA::Result.new("#{base}.json"), name,
|
297
297
|
longest_ssu_gene: '.ssu.fa',
|
298
|
-
|
298
|
+
ssu_gff: '.ssu.gff', # DEPRECATED
|
299
|
+
gff: '.gff',
|
299
300
|
all_ssu_genes: '.ssu.all.fa',
|
300
|
-
classification: '.rdp.tsv'
|
301
|
+
classification: '.rdp.tsv',
|
302
|
+
trna_list: '.trna.txt'
|
301
303
|
)
|
302
304
|
opts[:is_clean] ||= false
|
303
305
|
r.clean! if opts[:is_clean]
|
@@ -384,10 +386,7 @@ module MiGA::Dataset::Result
|
|
384
386
|
##
|
385
387
|
# Add result type +:stats+ at +base+ (no +_opts+ supported)
|
386
388
|
def add_result_stats(base, _opts)
|
387
|
-
|
388
|
-
MiGA::Result.new("#{base}.json"), name,
|
389
|
-
trna_list: '.trna.txt'
|
390
|
-
)
|
389
|
+
MiGA::Result.new("#{base}.json")
|
391
390
|
end
|
392
391
|
|
393
392
|
##
|
data/lib/miga/result/stats.rb
CHANGED
@@ -159,18 +159,52 @@ module MiGA::Result::Stats
|
|
159
159
|
end
|
160
160
|
|
161
161
|
def compute_stats_ssu
|
162
|
-
stats = {
|
162
|
+
stats = {
|
163
|
+
ssu: 0, complete_ssu: 0, ssu_fragment: 0.0,
|
164
|
+
lsu: 0, complete_lsu: 0, lsu_fragment: 0.0
|
165
|
+
}
|
166
|
+
|
163
167
|
Zlib::GzipReader.open(file_path(:gff)) do |fh|
|
164
168
|
fh.each_line do |ln|
|
165
169
|
next if ln =~ /^#/
|
166
170
|
|
167
171
|
rl = ln.chomp.split("\t")
|
168
|
-
|
169
|
-
|
172
|
+
feat = Hash[rl[8].split(';').map { |i| i.split('=', 2) }]
|
173
|
+
subunit = feat['Name'] == '16S_rRNA' ? :ssu : :lsu
|
174
|
+
if subunit == :ssu
|
175
|
+
len = (rl[4].to_i - rl[3].to_i).abs + 1
|
176
|
+
stats[:max_length] = [stats[:max_length] || 0, len].max
|
177
|
+
end
|
170
178
|
stats[:ssu] += 1
|
171
|
-
|
179
|
+
if feat['product'] =~ /\(partial\)/
|
180
|
+
if feat['note'] =~ /aligned only (\d+) percent/
|
181
|
+
fragment = $1.to_f
|
182
|
+
stats[:"#{subunit}_fragment"] ||= [fragment, '%']
|
183
|
+
if fragment > stats[:"#{subunit}_fragment"][0]
|
184
|
+
stats[:"#{subunit}_fragment"][0] = fragment
|
185
|
+
end
|
186
|
+
end
|
187
|
+
else
|
188
|
+
stats[:"complete_#{subunit}"] += 1
|
189
|
+
stats[:"#{subunit}_fragment"] = [100.0, '%']
|
190
|
+
end
|
172
191
|
end
|
173
192
|
end
|
193
|
+
|
194
|
+
Zlib::GzipReader.open(file_path(:trna_list)) do |fh|
|
195
|
+
no = 0
|
196
|
+
stats[:trna_count] = 0
|
197
|
+
aa = {}
|
198
|
+
fh.each_line do |ln|
|
199
|
+
next if (no += 1) < 4
|
200
|
+
stats[:trna_count] += 1
|
201
|
+
row = ln.chomp.split("\t")
|
202
|
+
next if row[9] == 'pseudo' || row[4] == 'Undet'
|
203
|
+
aa[row[4].gsub(/^f?([A-Za-z]+)[0-9]?/, '\1')] = true
|
204
|
+
end
|
205
|
+
stats[:trna_aa] = aa.size
|
206
|
+
end if file_path(:trna_list)
|
207
|
+
|
174
208
|
stats
|
175
209
|
end
|
176
210
|
|
data/lib/miga/version.rb
CHANGED
@@ -12,7 +12,7 @@ module MiGA
|
|
12
12
|
# - String indicating release status:
|
13
13
|
# - rc* release candidate, not released as gem
|
14
14
|
# - [0-9]+ stable release, released as gem
|
15
|
-
VERSION = [1.2,
|
15
|
+
VERSION = [1.2, 13, 0].freeze
|
16
16
|
|
17
17
|
##
|
18
18
|
# Nickname for the current major.minor version.
|
@@ -20,7 +20,7 @@ module MiGA
|
|
20
20
|
|
21
21
|
##
|
22
22
|
# Date of the current gem relese.
|
23
|
-
VERSION_DATE = Date.new(2022, 12,
|
23
|
+
VERSION_DATE = Date.new(2022, 12, 31)
|
24
24
|
|
25
25
|
##
|
26
26
|
# References of MiGA
|
data/scripts/ssu.bash
CHANGED
@@ -13,36 +13,56 @@ miga date > "$DATASET.start"
|
|
13
13
|
|
14
14
|
fa="../../../05.assembly/$DATASET.LargeContigs.fna"
|
15
15
|
if [[ -s $fa ]] ; then
|
16
|
+
# Get domain
|
17
|
+
d="$(miga ls -P "$PROJECT" -D "$DATASET" -m tax:d | awk '{print $2}')"
|
18
|
+
if [[ "$d" != "Bacteria" && "$d" != "Archaea" && "$d" != "Eukaryota" ]] ; then
|
19
|
+
d="Bacteria" # Assume Bacteria in the absence of additional information
|
20
|
+
fi
|
21
|
+
|
16
22
|
# Run barrnap
|
17
|
-
|
18
|
-
|
23
|
+
dom_opt="$(echo "$d" | perl -ne 'print lc' | head -c 3)"
|
24
|
+
barrnap --quiet --kingdom "$dom_opt" --threads "$CORES" "$fa" \
|
25
|
+
> "${DATASET}.gff"
|
19
26
|
|
20
27
|
# Extract
|
21
|
-
|
22
|
-
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
+
grep "^##gff\\|;product=16S " < "${DATASET}.gff" \
|
29
|
+
| bedtools getfasta -s "-fi" "$fa" -bed /dev/stdin \
|
30
|
+
-fo "${DATASET}.ssu.all.fa"
|
31
|
+
FastA.length.pl "${DATASET}.ssu.all.fa" | sort -nr -k 2 | head -n 1 \
|
32
|
+
| cut -f 1 > "${DATASET}.ssu.fa.id"
|
33
|
+
FastA.filter.pl "${DATASET}.ssu.fa.id" "${DATASET}.ssu.all.fa" \
|
34
|
+
> "${DATASET}.ssu.fa"
|
35
|
+
rm "${DATASET}.ssu.fa.id"
|
36
|
+
[[ -e "${fa}.fai" ]] && rm "${fa}.fai"
|
28
37
|
|
29
38
|
# RDP classifier
|
30
|
-
if [[ "$MIGA_RDP" == "yes" && -s "$DATASET.ssu.all.fa" ]] ; then
|
39
|
+
if [[ "$MIGA_RDP" == "yes" && -s "${DATASET}.ssu.all.fa" ]] ; then
|
31
40
|
java -jar "$MIGA_HOME/.miga_db/classifier.jar" classify \
|
32
|
-
-c 0.8 -f fixrank -g 16srrna -o "$DATASET.rdp.tsv" \
|
33
|
-
"$DATASET.ssu.all.fa"
|
41
|
+
-c 0.8 -f fixrank -g 16srrna -o "${DATASET}.rdp.tsv" \
|
42
|
+
"${DATASET}.ssu.all.fa"
|
34
43
|
echo "# Version: $(perl -pe 's/.*://' \
|
35
44
|
< "$MIGA_HOME/.miga_db/classifier.version.txt" \
|
36
45
|
| grep . | paste - - | perl -pe 's/\t/; /')" \
|
37
|
-
>> "$DATASET.rdp.tsv"
|
46
|
+
>> "${DATASET}.rdp.tsv"
|
38
47
|
fi
|
39
48
|
|
49
|
+
# tRNAscan-SE
|
50
|
+
dom_opt="-$(echo "$d" | perl -pe 's/(\S).*/$1/')"
|
51
|
+
out="${DATASET}.trna.txt"
|
52
|
+
# `echo O` is to avoid a hang from a pre-existing output file.
|
53
|
+
# This is better than pre-checking (and removing), because it avoids
|
54
|
+
# the (unlikely) scenario of a file racing (e.g., a file created right
|
55
|
+
# before tRNAscan-SE starts, or a `rm` failure).
|
56
|
+
#
|
57
|
+
# The trailing `|| true` is to treat failure as non-fatal
|
58
|
+
echo O | tRNAscan-SE $dom_opt -o "${DATASET}.trna.txt" -q "$fa" || true
|
59
|
+
|
40
60
|
# Gzip
|
41
|
-
for x in
|
61
|
+
for x in gff ssu.all.fa rdp.tsv trna.txt ; do
|
42
62
|
[[ -e "${DATASET}.${x}" ]] && gzip -9 -f "${DATASET}.${x}"
|
43
63
|
done
|
44
64
|
fi
|
45
65
|
|
46
66
|
# Finalize
|
47
|
-
miga date > "$DATASET.done"
|
67
|
+
miga date > "${DATASET}.done"
|
48
68
|
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
|
data/scripts/stats.bash
CHANGED
@@ -11,33 +11,8 @@ cd "$DIR"
|
|
11
11
|
# Initialize
|
12
12
|
miga date > "${DATASET}.start"
|
13
13
|
|
14
|
-
# tRNAscan-SE
|
15
|
-
fa="../05.assembly/${DATASET}.LargeContigs.fna"
|
16
|
-
if [[ -s "$fa" ]] ; then
|
17
|
-
d="$(miga ls -P "$PROJECT" -D "$DATASET" -m tax:d | awk '{print $2}')"
|
18
|
-
if [[ "$d" == "Bacteria" || "$d" == "Archaea" || "$d" == "Eukaryota" ]] ; then
|
19
|
-
dom_opt="-$(echo "$d" | perl -pe 's/(\S).*/$1/')"
|
20
|
-
out="${DATASET}.trna.txt"
|
21
|
-
# `echo O` is to avoid a hang from a pre-existing output file.
|
22
|
-
# This is better than pre-checking (and removing), because it avoids
|
23
|
-
# the (unlikely) scenario of a file racing (e.g., a file created right
|
24
|
-
# before tRNAscan-SE starts, or a `rm` failure).
|
25
|
-
#
|
26
|
-
# The trailing `|| true` is to treat failure as non-fatal
|
27
|
-
echo O | tRNAscan-SE $dom_opt -o "$out" -q "$fa" || true
|
28
|
-
if [[ -s "$out" ]] ; then
|
29
|
-
cnt=$(tail -n +4 "$out" | wc -l | awk '{print $1}')
|
30
|
-
aa="$(tail -n +4 "$out" | grep -v 'pseudo$' | awk '{print $5}' \
|
31
|
-
| grep -v 'Undet' | perl -pe 's/^f?([A-Za-z]+)[0-9]?/$1/' \
|
32
|
-
| sort | uniq | wc -l | awk '{print $1}')"
|
33
|
-
miga edit -P "$PROJECT" -D "$DATASET" \
|
34
|
-
-m "trna_count=Int($cnt),trna_aa=Int($aa)"
|
35
|
-
fi
|
36
|
-
fi
|
37
|
-
fi
|
38
|
-
|
39
14
|
# Calculate statistics
|
40
|
-
for i in raw_reads trimmed_fasta assembly cds essential_genes
|
15
|
+
for i in raw_reads trimmed_fasta assembly cds essential_genes distances taxonomy ssu ; do
|
41
16
|
echo "# $i"
|
42
17
|
miga stats --compute-and-save --ignore-empty -P "$PROJECT" -D "$DATASET" -r $i
|
43
18
|
done
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.
|
4
|
+
version: 1.2.13.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-12-
|
11
|
+
date: 2022-12-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|