miga-base 1.2.12.1 → 1.2.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 176a9a20427a4b14db0418fb1140025d2a5dc3737e4e6c55926524bd4d0f4d32
4
- data.tar.gz: 4d1dca0ba71ea2ad86f4a4ffe5fbda8a8146f91464f092efe780ba92598bfd36
3
+ metadata.gz: 036b7181da39c88a4bf825230359f4efd9852b8aea488943028df6fde68184bf
4
+ data.tar.gz: 4584f52f0ea297120e23bec511850d2b2eeff7c3b76e1d013978f9a50ea5ea25
5
5
  SHA512:
6
- metadata.gz: 521cb14499220bcf5cec8d5a532f7e8fe2708eac6ca7bdd53e5ca5472857d0a57f08750a232cbbd2cf704b082295a6668ec275e0956fc08a783dda13b0cc0471
7
- data.tar.gz: 451d7f35d2e1666b3125fa58a85c171eb89310379a6acd67c1fd19bb0e1274e1cfe0a93775ccf42e93f03ea5f0e85b6045147fc8272b0d26e57bfdb97ecfa623
6
+ metadata.gz: a4b447b41e5647fba734c6b57a355f4889249f3265b085fb8e2277313731b830b0c1f7ea109e0895ba800405e054e5c945890dee924f645a52d6393d4455ab26
7
+ data.tar.gz: 9bc1e223e57db9ce9675994fa4e1a7bb4b1aa5509a296295ad8743a4b02e9ece7efdfaec27287bf397f2cd941ac57a3e87e9ac5597516f292ebda8d5dcd726e8
@@ -155,6 +155,9 @@ class String
155
155
  .gsub(/g_c_(skew)/, 'G-C \\1')
156
156
  .gsub(/a_t_(skew)/, 'A-T \\1')
157
157
  .gsub(/x_content/, &:capitalize)
158
+ .gsub(/(^|_)([sl]su|a[an]i)(_|$)/, &:upcase)
159
+ .gsub(/^trna_/, 'tRNA ')
160
+ .gsub(/tRNA aa/, 'tRNA AA')
158
161
  .tr('_', ' ')
159
162
  end
160
163
 
@@ -38,13 +38,13 @@ module MiGA::Dataset::Base
38
38
  cds: '06.cds',
39
39
  # Annotation
40
40
  essential_genes: '07.annotation/01.function/01.essential',
41
- ssu: '07.annotation/01.function/02.ssu',
42
41
  mytaxa: '07.annotation/02.taxonomy/01.mytaxa',
43
42
  mytaxa_scan: '07.annotation/03.qa/02.mytaxa_scan',
44
43
  # Distances (for single-species datasets)
45
44
  taxonomy: '09.distances/05.taxonomy',
46
45
  distances: '09.distances',
47
- # General statistics
46
+ # Post-QC
47
+ ssu: '07.annotation/01.function/02.ssu',
48
48
  stats: '90.stats'
49
49
  }
50
50
 
@@ -72,8 +72,8 @@ module MiGA::Dataset::Base
72
72
  # Returns an Array of tasks to be executed before project-wide tasks
73
73
  @@PREPROCESSING_TASKS = [
74
74
  :raw_reads, :trimmed_reads, :read_quality, :trimmed_fasta,
75
- :assembly, :cds, :essential_genes, :ssu, :mytaxa, :mytaxa_scan,
76
- :taxonomy, :distances, :stats
75
+ :assembly, :cds, :essential_genes, :mytaxa, :mytaxa_scan,
76
+ :taxonomy, :distances, :ssu, :stats
77
77
  ]
78
78
 
79
79
  ##
@@ -295,9 +295,11 @@ module MiGA::Dataset::Result
295
295
  r = add_files_to_ds_result(
296
296
  MiGA::Result.new("#{base}.json"), name,
297
297
  longest_ssu_gene: '.ssu.fa',
298
- gff: '.ssu.gff',
298
+ ssu_gff: '.ssu.gff', # DEPRECATED
299
+ gff: '.gff',
299
300
  all_ssu_genes: '.ssu.all.fa',
300
- classification: '.rdp.tsv'
301
+ classification: '.rdp.tsv',
302
+ trna_list: '.trna.txt'
301
303
  )
302
304
  opts[:is_clean] ||= false
303
305
  r.clean! if opts[:is_clean]
@@ -384,10 +386,7 @@ module MiGA::Dataset::Result
384
386
  ##
385
387
  # Add result type +:stats+ at +base+ (no +_opts+ supported)
386
388
  def add_result_stats(base, _opts)
387
- add_files_to_ds_result(
388
- MiGA::Result.new("#{base}.json"), name,
389
- trna_list: '.trna.txt'
390
- )
389
+ MiGA::Result.new("#{base}.json")
391
390
  end
392
391
 
393
392
  ##
@@ -159,18 +159,51 @@ module MiGA::Result::Stats
159
159
  end
160
160
 
161
161
  def compute_stats_ssu
162
- stats = { ssu: 0, complete_ssu: 0 }
162
+ stats = {
163
+ ssu: 0, complete_ssu: 0, ssu_fragment: [0.0, '%'],
164
+ lsu: 0, complete_lsu: 0, lsu_fragment: [0.0, '%']
165
+ }
166
+
163
167
  Zlib::GzipReader.open(file_path(:gff)) do |fh|
164
168
  fh.each_line do |ln|
165
169
  next if ln =~ /^#/
166
170
 
167
171
  rl = ln.chomp.split("\t")
168
- len = (rl[4].to_i - rl[3].to_i).abs + 1
169
- stats[:max_length] = [stats[:max_length] || 0, len].max
172
+ feat = Hash[rl[8].split(';').map { |i| i.split('=', 2) }]
173
+ subunit = feat['Name'] == '16S_rRNA' ? :ssu : :lsu
174
+ if subunit == :ssu
175
+ len = (rl[4].to_i - rl[3].to_i).abs + 1
176
+ stats[:max_length] = [stats[:max_length] || 0, len].max
177
+ end
170
178
  stats[:ssu] += 1
171
- stats[:complete_ssu] += 1 unless rl[8] =~ /\(partial\)/
179
+ if feat['product'] =~ /\(partial\)/
180
+ if feat['note'] =~ /aligned only (\d+) percent/
181
+ fragment = $1.to_f
182
+ if fragment > stats[:"#{subunit}_fragment"][0]
183
+ stats[:"#{subunit}_fragment"][0] = fragment
184
+ end
185
+ end
186
+ else
187
+ stats[:"complete_#{subunit}"] += 1
188
+ stats[:"#{subunit}_fragment"] = [100.0, '%']
189
+ end
172
190
  end
173
191
  end
192
+
193
+ Zlib::GzipReader.open(file_path(:trna_list)) do |fh|
194
+ no = 0
195
+ stats[:trna_count] = 0
196
+ aa = {}
197
+ fh.each_line do |ln|
198
+ next if (no += 1) < 4
199
+ stats[:trna_count] += 1
200
+ row = ln.chomp.split("\t")
201
+ next if row[9] == 'pseudo' || row[4] == 'Undet'
202
+ aa[row[4].gsub(/^f?([A-Za-z]+)[0-9]?/, '\1')] = true
203
+ end
204
+ stats[:trna_aa] = aa.size
205
+ end if file_path(:trna_list)
206
+
174
207
  stats
175
208
  end
176
209
 
data/lib/miga/version.rb CHANGED
@@ -12,7 +12,7 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.2, 12, 1].freeze
15
+ VERSION = [1.2, 13, 1].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
@@ -20,7 +20,7 @@ module MiGA
20
20
 
21
21
  ##
22
22
  # Date of the current gem release.
23
- VERSION_DATE = Date.new(2022, 12, 30)
23
+ VERSION_DATE = Date.new(2022, 12, 31)
24
24
 
25
25
  ##
26
26
  # References of MiGA
data/scripts/ssu.bash CHANGED
@@ -13,36 +13,56 @@ miga date > "$DATASET.start"
13
13
 
14
14
  fa="../../../05.assembly/$DATASET.LargeContigs.fna"
15
15
  if [[ -s $fa ]] ; then
16
+ # Get domain
17
+ d="$(miga ls -P "$PROJECT" -D "$DATASET" -m tax:d | awk '{print $2}')"
18
+ if [[ "$d" != "Bacteria" && "$d" != "Archaea" && "$d" != "Eukaryota" ]] ; then
19
+ d="Bacteria" # Assume Bacteria in the absence of additional information
20
+ fi
21
+
16
22
  # Run barrnap
17
- barrnap --quiet --threads "$CORES" "$fa" | grep "^##gff\\|;product=16S " \
18
- > "$DATASET.ssu.gff"
23
+ dom_opt="$(echo "$d" | perl -ne 'print lc' | head -c 3)"
24
+ barrnap --quiet --kingdom "$dom_opt" --threads "$CORES" "$fa" \
25
+ > "${DATASET}.gff"
19
26
 
20
27
  # Extract
21
- bedtools getfasta -s "-fi" "$fa" -bed "$DATASET.ssu.gff" \
22
- -fo "$DATASET.ssu.all.fa"
23
- FastA.length.pl "$DATASET.ssu.all.fa" | sort -nr -k 2 | head -n 1 \
24
- | cut -f 1 > "$DATASET.ssu.fa.id"
25
- FastA.filter.pl "$DATASET.ssu.fa.id" "$DATASET.ssu.all.fa" > "$DATASET.ssu.fa"
26
- rm "$DATASET.ssu.fa.id"
27
- [[ -e "$fa.fai" ]] && rm "$fa.fai"
28
+ grep "^##gff\\|;product=16S " < "${DATASET}.gff" \
29
+ | bedtools getfasta -s "-fi" "$fa" -bed /dev/stdin \
30
+ -fo "${DATASET}.ssu.all.fa"
31
+ FastA.length.pl "${DATASET}.ssu.all.fa" | sort -nr -k 2 | head -n 1 \
32
+ | cut -f 1 > "${DATASET}.ssu.fa.id"
33
+ FastA.filter.pl "${DATASET}.ssu.fa.id" "${DATASET}.ssu.all.fa" \
34
+ > "${DATASET}.ssu.fa"
35
+ rm "${DATASET}.ssu.fa.id"
36
+ [[ -e "${fa}.fai" ]] && rm "${fa}.fai"
28
37
 
29
38
  # RDP classifier
30
- if [[ "$MIGA_RDP" == "yes" && -s "$DATASET.ssu.all.fa" ]] ; then
39
+ if [[ "$MIGA_RDP" == "yes" && -s "${DATASET}.ssu.all.fa" ]] ; then
31
40
  java -jar "$MIGA_HOME/.miga_db/classifier.jar" classify \
32
- -c 0.8 -f fixrank -g 16srrna -o "$DATASET.rdp.tsv" \
33
- "$DATASET.ssu.all.fa"
41
+ -c 0.8 -f fixrank -g 16srrna -o "${DATASET}.rdp.tsv" \
42
+ "${DATASET}.ssu.all.fa"
34
43
  echo "# Version: $(perl -pe 's/.*://' \
35
44
  < "$MIGA_HOME/.miga_db/classifier.version.txt" \
36
45
  | grep . | paste - - | perl -pe 's/\t/; /')" \
37
- >> "$DATASET.rdp.tsv"
46
+ >> "${DATASET}.rdp.tsv"
38
47
  fi
39
48
 
49
+ # tRNAscan-SE
50
+ dom_opt="-$(echo "$d" | perl -pe 's/(\S).*/$1/')"
51
+ out="${DATASET}.trna.txt"
52
+ # `echo O` is to avoid a hang from a pre-existing output file.
53
+ # This is better than pre-checking (and removing), because it avoids
54
+ # the (unlikely) scenario of a file racing (e.g., a file created right
55
+ # before tRNAscan-SE starts, or a `rm` failure).
56
+ #
57
+ # The trailing `|| true` is to treat failure as non-fatal
58
+ echo O | tRNAscan-SE $dom_opt -o "${DATASET}.trna.txt" -q "$fa" || true
59
+
40
60
  # Gzip
41
- for x in ssu.gff ssu.all.fa rdp.tsv ; do
61
+ for x in gff ssu.all.fa rdp.tsv trna.txt ; do
42
62
  [[ -e "${DATASET}.${x}" ]] && gzip -9 -f "${DATASET}.${x}"
43
63
  done
44
64
  fi
45
65
 
46
66
  # Finalize
47
- miga date > "$DATASET.done"
67
+ miga date > "${DATASET}.done"
48
68
  miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
data/scripts/stats.bash CHANGED
@@ -11,33 +11,8 @@ cd "$DIR"
11
11
  # Initialize
12
12
  miga date > "${DATASET}.start"
13
13
 
14
- # tRNAscan-SE
15
- fa="../05.assembly/${DATASET}.LargeContigs.fna"
16
- if [[ -s "$fa" ]] ; then
17
- d="$(miga ls -P "$PROJECT" -D "$DATASET" -m tax:d | awk '{print $2}')"
18
- if [[ "$d" == "Bacteria" || "$d" == "Archaea" || "$d" == "Eukaryota" ]] ; then
19
- dom_opt="-$(echo "$d" | perl -pe 's/(\S).*/$1/')"
20
- out="${DATASET}.trna.txt"
21
- # `echo O` is to avoid a hang from a pre-existing output file.
22
- # This is better than pre-checking (and removing), because it avoids
23
- # the (unlikely) scenario of a file racing (e.g., a file created right
24
- # before tRNAscan-SE starts, or a `rm` failure).
25
- #
26
- # The trailing `|| true` is to treat failure as non-fatal
27
- echo O | tRNAscan-SE $dom_opt -o "$out" -q "$fa" || true
28
- if [[ -s "$out" ]] ; then
29
- cnt=$(tail -n +4 "$out" | wc -l | awk '{print $1}')
30
- aa="$(tail -n +4 "$out" | grep -v 'pseudo$' | awk '{print $5}' \
31
- | grep -v 'Undet' | perl -pe 's/^f?([A-Za-z]+)[0-9]?/$1/' \
32
- | sort | uniq | wc -l | awk '{print $1}')"
33
- miga edit -P "$PROJECT" -D "$DATASET" \
34
- -m "trna_count=Int($cnt),trna_aa=Int($aa)"
35
- fi
36
- fi
37
- fi
38
-
39
14
  # Calculate statistics
40
- for i in raw_reads trimmed_fasta assembly cds essential_genes ssu distances taxonomy ; do
15
+ for i in raw_reads trimmed_fasta assembly cds essential_genes distances taxonomy ssu ; do
41
16
  echo "# $i"
42
17
  miga stats --compute-and-save --ignore-empty -P "$PROJECT" -D "$DATASET" -r $i
43
18
  done
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.12.1
4
+ version: 1.2.13.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-12-30 00:00:00.000000000 Z
11
+ date: 2022-12-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons