miga-base 1.2.12.1 → 1.2.13.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 176a9a20427a4b14db0418fb1140025d2a5dc3737e4e6c55926524bd4d0f4d32
4
- data.tar.gz: 4d1dca0ba71ea2ad86f4a4ffe5fbda8a8146f91464f092efe780ba92598bfd36
3
+ metadata.gz: c997637653ab5c7174be88b049f9ec78c432469a0bf08dc8c3572563480733b8
4
+ data.tar.gz: 731b7cad9da3266f2d21161dd5669d20214a648cded2a9cb9db3b0f23881a3fb
5
5
  SHA512:
6
- metadata.gz: 521cb14499220bcf5cec8d5a532f7e8fe2708eac6ca7bdd53e5ca5472857d0a57f08750a232cbbd2cf704b082295a6668ec275e0956fc08a783dda13b0cc0471
7
- data.tar.gz: 451d7f35d2e1666b3125fa58a85c171eb89310379a6acd67c1fd19bb0e1274e1cfe0a93775ccf42e93f03ea5f0e85b6045147fc8272b0d26e57bfdb97ecfa623
6
+ metadata.gz: af2b18c2d3153ee02a8a7aa6f79efda6f451678e63353e9a9e02b9248915d63536ab46aabe5d9e55d942da51d38d66df77c67aa25e15e2df604cf1e6af075dc3
7
+ data.tar.gz: 3cce4b8c2cce42a87d31e93c14c7c838ad40a90b5daa3c4220070cada7c0b092dfcc7c76e6c3744dcb9d144d9570e552046a6fec82cadfced72857814be27c8a
@@ -155,6 +155,9 @@ class String
155
155
  .gsub(/g_c_(skew)/, 'G-C \\1')
156
156
  .gsub(/a_t_(skew)/, 'A-T \\1')
157
157
  .gsub(/x_content/, &:capitalize)
158
+ .gsub(/(^|_)([sl]su|a[an]i)(_|$)/, &:upcase)
159
+ .gsub(/^trna_/, 'tRNA ')
160
+ .gsub(/tRNA aa/, 'tRNA AA')
158
161
  .tr('_', ' ')
159
162
  end
160
163
 
@@ -38,13 +38,13 @@ module MiGA::Dataset::Base
38
38
  cds: '06.cds',
39
39
  # Annotation
40
40
  essential_genes: '07.annotation/01.function/01.essential',
41
- ssu: '07.annotation/01.function/02.ssu',
42
41
  mytaxa: '07.annotation/02.taxonomy/01.mytaxa',
43
42
  mytaxa_scan: '07.annotation/03.qa/02.mytaxa_scan',
44
43
  # Distances (for single-species datasets)
45
44
  taxonomy: '09.distances/05.taxonomy',
46
45
  distances: '09.distances',
47
- # General statistics
46
+ # Post-QC
47
+ ssu: '07.annotation/01.function/02.ssu',
48
48
  stats: '90.stats'
49
49
  }
50
50
 
@@ -72,8 +72,8 @@ module MiGA::Dataset::Base
72
72
  # Returns an Array of tasks to be executed before project-wide tasks
73
73
  @@PREPROCESSING_TASKS = [
74
74
  :raw_reads, :trimmed_reads, :read_quality, :trimmed_fasta,
75
- :assembly, :cds, :essential_genes, :ssu, :mytaxa, :mytaxa_scan,
76
- :taxonomy, :distances, :stats
75
+ :assembly, :cds, :essential_genes, :mytaxa, :mytaxa_scan,
76
+ :taxonomy, :distances, :ssu, :stats
77
77
  ]
78
78
 
79
79
  ##
@@ -295,9 +295,11 @@ module MiGA::Dataset::Result
295
295
  r = add_files_to_ds_result(
296
296
  MiGA::Result.new("#{base}.json"), name,
297
297
  longest_ssu_gene: '.ssu.fa',
298
- gff: '.ssu.gff',
298
+ ssu_gff: '.ssu.gff', # DEPRECATED
299
+ gff: '.gff',
299
300
  all_ssu_genes: '.ssu.all.fa',
300
- classification: '.rdp.tsv'
301
+ classification: '.rdp.tsv',
302
+ trna_list: '.trna.txt'
301
303
  )
302
304
  opts[:is_clean] ||= false
303
305
  r.clean! if opts[:is_clean]
@@ -384,10 +386,7 @@ module MiGA::Dataset::Result
384
386
  ##
385
387
  # Add result type +:stats+ at +base+ (no +_opts+ supported)
386
388
  def add_result_stats(base, _opts)
387
- add_files_to_ds_result(
388
- MiGA::Result.new("#{base}.json"), name,
389
- trna_list: '.trna.txt'
390
- )
389
+ MiGA::Result.new("#{base}.json")
391
390
  end
392
391
 
393
392
  ##
@@ -159,18 +159,52 @@ module MiGA::Result::Stats
159
159
  end
160
160
 
161
161
  def compute_stats_ssu
162
- stats = { ssu: 0, complete_ssu: 0 }
162
+ stats = {
163
+ ssu: 0, complete_ssu: 0, ssu_fragment: 0.0,
164
+ lsu: 0, complete_lsu: 0, lsu_fragment: 0.0
165
+ }
166
+
163
167
  Zlib::GzipReader.open(file_path(:gff)) do |fh|
164
168
  fh.each_line do |ln|
165
169
  next if ln =~ /^#/
166
170
 
167
171
  rl = ln.chomp.split("\t")
168
- len = (rl[4].to_i - rl[3].to_i).abs + 1
169
- stats[:max_length] = [stats[:max_length] || 0, len].max
172
+ feat = Hash[rl[8].split(';').map { |i| i.split('=', 2) }]
173
+ subunit = feat['Name'] == '16S_rRNA' ? :ssu : :lsu
174
+ if subunit == :ssu
175
+ len = (rl[4].to_i - rl[3].to_i).abs + 1
176
+ stats[:max_length] = [stats[:max_length] || 0, len].max
177
+ end
170
178
  stats[:ssu] += 1
171
- stats[:complete_ssu] += 1 unless rl[8] =~ /\(partial\)/
179
+ if feat['product'] =~ /\(partial\)/
180
+ if feat['note'] =~ /aligned only (\d+) percent/
181
+ fragment = $1.to_f
182
+ stats[:"#{subunit}_fragment"] ||= [fragment, '%']
183
+ if fragment > stats[:"#{subunit}_fragment"][0]
184
+ stats[:"#{subunit}_fragment"][0] = fragment
185
+ end
186
+ end
187
+ else
188
+ stats[:"complete_#{subunit}"] += 1
189
+ stats[:"#{subunit}_fragment"] = [100.0, '%']
190
+ end
172
191
  end
173
192
  end
193
+
194
+ Zlib::GzipReader.open(file_path(:trna_list)) do |fh|
195
+ no = 0
196
+ stats[:trna_count] = 0
197
+ aa = {}
198
+ fh.each_line do |ln|
199
+ next if (no += 1) < 4
200
+ stats[:trna_count] += 1
201
+ row = ln.chomp.split("\t")
202
+ next if row[9] == 'pseudo' || row[4] == 'Undet'
203
+ aa[row[4].gsub(/^f?([A-Za-z]+)[0-9]?/, '\1')] = true
204
+ end
205
+ stats[:trna_aa] = aa.size
206
+ end if file_path(:trna_list)
207
+
174
208
  stats
175
209
  end
176
210
 
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.2, 12, 1].freeze
15
+ VERSION = [1.2, 13, 0].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2022, 12, 30)
23
+ VERSION_DATE = Date.new(2022, 12, 31)
24
24
 
25
25
  ##
26
26
  # References of MiGA
data/scripts/ssu.bash CHANGED
@@ -13,36 +13,56 @@ miga date > "$DATASET.start"
13
13
 
14
14
  fa="../../../05.assembly/$DATASET.LargeContigs.fna"
15
15
  if [[ -s $fa ]] ; then
16
+ # Get domain
17
+ d="$(miga ls -P "$PROJECT" -D "$DATASET" -m tax:d | awk '{print $2}')"
18
+ if [[ "$d" != "Bacteria" && "$d" != "Archaea" && "$d" != "Eukaryota" ]] ; then
19
+ d="Bacteria" # Assume Bacteria in the absence of additional information
20
+ fi
21
+
16
22
  # Run barrnap
17
- barrnap --quiet --threads "$CORES" "$fa" | grep "^##gff\\|;product=16S " \
18
- > "$DATASET.ssu.gff"
23
+ dom_opt="$(echo "$d" | perl -ne 'print lc' | head -c 3)"
24
+ barrnap --quiet --kingdom "$dom_opt" --threads "$CORES" "$fa" \
25
+ > "${DATASET}.gff"
19
26
 
20
27
  # Extract
21
- bedtools getfasta -s "-fi" "$fa" -bed "$DATASET.ssu.gff" \
22
- -fo "$DATASET.ssu.all.fa"
23
- FastA.length.pl "$DATASET.ssu.all.fa" | sort -nr -k 2 | head -n 1 \
24
- | cut -f 1 > "$DATASET.ssu.fa.id"
25
- FastA.filter.pl "$DATASET.ssu.fa.id" "$DATASET.ssu.all.fa" > "$DATASET.ssu.fa"
26
- rm "$DATASET.ssu.fa.id"
27
- [[ -e "$fa.fai" ]] && rm "$fa.fai"
28
+ grep "^##gff\\|;product=16S " < "${DATASET}.gff" \
29
+ | bedtools getfasta -s "-fi" "$fa" -bed /dev/stdin \
30
+ -fo "${DATASET}.ssu.all.fa"
31
+ FastA.length.pl "${DATASET}.ssu.all.fa" | sort -nr -k 2 | head -n 1 \
32
+ | cut -f 1 > "${DATASET}.ssu.fa.id"
33
+ FastA.filter.pl "${DATASET}.ssu.fa.id" "${DATASET}.ssu.all.fa" \
34
+ > "${DATASET}.ssu.fa"
35
+ rm "${DATASET}.ssu.fa.id"
36
+ [[ -e "${fa}.fai" ]] && rm "${fa}.fai"
28
37
 
29
38
  # RDP classifier
30
- if [[ "$MIGA_RDP" == "yes" && -s "$DATASET.ssu.all.fa" ]] ; then
39
+ if [[ "$MIGA_RDP" == "yes" && -s "${DATASET}.ssu.all.fa" ]] ; then
31
40
  java -jar "$MIGA_HOME/.miga_db/classifier.jar" classify \
32
- -c 0.8 -f fixrank -g 16srrna -o "$DATASET.rdp.tsv" \
33
- "$DATASET.ssu.all.fa"
41
+ -c 0.8 -f fixrank -g 16srrna -o "${DATASET}.rdp.tsv" \
42
+ "${DATASET}.ssu.all.fa"
34
43
  echo "# Version: $(perl -pe 's/.*://' \
35
44
  < "$MIGA_HOME/.miga_db/classifier.version.txt" \
36
45
  | grep . | paste - - | perl -pe 's/\t/; /')" \
37
- >> "$DATASET.rdp.tsv"
46
+ >> "${DATASET}.rdp.tsv"
38
47
  fi
39
48
 
49
+ # tRNAscan-SE
50
+ dom_opt="-$(echo "$d" | perl -pe 's/(\S).*/$1/')"
51
+ out="${DATASET}.trna.txt"
52
+ # `echo O` is to avoid a hang from a pre-existing output file.
53
+ # This is better than pre-checking (and removing), because it avoids
54
+ # the (unlikely) scenario of a file racing (e.g., a file created right
55
+ # before tRNAscan-SE starts, or a `rm` failure).
56
+ #
57
+ # The trailing `|| true` is to treat failure as non-fatal
58
+ echo O | tRNAscan-SE $dom_opt -o "${DATASET}.trna.txt" -q "$fa" || true
59
+
40
60
  # Gzip
41
- for x in ssu.gff ssu.all.fa rdp.tsv ; do
61
+ for x in gff ssu.all.fa rdp.tsv trna.txt ; do
42
62
  [[ -e "${DATASET}.${x}" ]] && gzip -9 -f "${DATASET}.${x}"
43
63
  done
44
64
  fi
45
65
 
46
66
  # Finalize
47
- miga date > "$DATASET.done"
67
+ miga date > "${DATASET}.done"
48
68
  miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
data/scripts/stats.bash CHANGED
@@ -11,33 +11,8 @@ cd "$DIR"
11
11
  # Initialize
12
12
  miga date > "${DATASET}.start"
13
13
 
14
- # tRNAscan-SE
15
- fa="../05.assembly/${DATASET}.LargeContigs.fna"
16
- if [[ -s "$fa" ]] ; then
17
- d="$(miga ls -P "$PROJECT" -D "$DATASET" -m tax:d | awk '{print $2}')"
18
- if [[ "$d" == "Bacteria" || "$d" == "Archaea" || "$d" == "Eukaryota" ]] ; then
19
- dom_opt="-$(echo "$d" | perl -pe 's/(\S).*/$1/')"
20
- out="${DATASET}.trna.txt"
21
- # `echo O` is to avoid a hang from a pre-existing output file.
22
- # This is better than pre-checking (and removing), because it avoids
23
- # the (unlikely) scenario of a file racing (e.g., a file created right
24
- # before tRNAscan-SE starts, or a `rm` failure).
25
- #
26
- # The trailing `|| true` is to treat failure as non-fatal
27
- echo O | tRNAscan-SE $dom_opt -o "$out" -q "$fa" || true
28
- if [[ -s "$out" ]] ; then
29
- cnt=$(tail -n +4 "$out" | wc -l | awk '{print $1}')
30
- aa="$(tail -n +4 "$out" | grep -v 'pseudo$' | awk '{print $5}' \
31
- | grep -v 'Undet' | perl -pe 's/^f?([A-Za-z]+)[0-9]?/$1/' \
32
- | sort | uniq | wc -l | awk '{print $1}')"
33
- miga edit -P "$PROJECT" -D "$DATASET" \
34
- -m "trna_count=Int($cnt),trna_aa=Int($aa)"
35
- fi
36
- fi
37
- fi
38
-
39
14
  # Calculate statistics
40
- for i in raw_reads trimmed_fasta assembly cds essential_genes ssu distances taxonomy ; do
15
+ for i in raw_reads trimmed_fasta assembly cds essential_genes distances taxonomy ssu ; do
41
16
  echo "# $i"
42
17
  miga stats --compute-and-save --ignore-empty -P "$PROJECT" -D "$DATASET" -r $i
43
18
  done
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.12.1
4
+ version: 1.2.13.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-12-30 00:00:00.000000000 Z
11
+ date: 2022-12-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons