miga-base 0.7.26.3 → 1.0.0.sr1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. checksums.yaml +4 -4
  2. data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
  3. data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
  4. data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
  5. data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
  6. data/lib/miga/cli/action/doctor.rb +50 -19
  7. data/lib/miga/cli/action/doctor/base.rb +20 -18
  8. data/lib/miga/cli/action/init.rb +11 -7
  9. data/lib/miga/cli/action/init/files_helper.rb +1 -0
  10. data/lib/miga/cli/action/ncbi_get.rb +3 -3
  11. data/lib/miga/cli/action/tax_dist.rb +2 -2
  12. data/lib/miga/cli/action/wf.rb +5 -4
  13. data/lib/miga/daemon.rb +11 -4
  14. data/lib/miga/dataset/result.rb +10 -6
  15. data/lib/miga/json.rb +1 -2
  16. data/lib/miga/metadata.rb +5 -1
  17. data/lib/miga/parallel.rb +11 -6
  18. data/lib/miga/project.rb +8 -8
  19. data/lib/miga/project/base.rb +4 -4
  20. data/lib/miga/project/result.rb +2 -2
  21. data/lib/miga/sqlite.rb +7 -0
  22. data/lib/miga/version.rb +23 -9
  23. data/scripts/aai_distances.bash +16 -18
  24. data/scripts/ani_distances.bash +16 -17
  25. data/scripts/assembly.bash +31 -16
  26. data/scripts/haai_distances.bash +3 -27
  27. data/scripts/miga.bash +6 -4
  28. data/scripts/p.bash +1 -1
  29. data/scripts/read_quality.bash +9 -18
  30. data/scripts/trimmed_fasta.bash +14 -30
  31. data/scripts/trimmed_reads.bash +36 -36
  32. data/test/parallel_test.rb +31 -0
  33. data/test/project_test.rb +2 -1
  34. data/utils/distance/commands.rb +1 -0
  35. data/utils/distance/runner.rb +2 -4
  36. data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
  37. data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
  38. data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
  39. data/utils/enveomics/Manifest/Tasks/other.json +77 -0
  40. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
  41. data/utils/enveomics/Manifest/categories.json +13 -4
  42. data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
  43. data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
  44. data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
  45. data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
  46. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  47. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  48. data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
  49. data/utils/enveomics/Scripts/SRA.download.bash +6 -8
  50. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  51. data/utils/enveomics/Scripts/aai.rb +3 -2
  52. data/utils/enveomics/Scripts/anir.rb +137 -0
  53. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  54. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  55. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
  56. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  57. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  58. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
  59. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  60. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  61. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  62. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  63. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
  64. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  65. data/utils/enveomics/Scripts/rbm.rb +87 -133
  66. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  67. data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
  68. data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
  69. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  70. data/utils/enveomics/enveomics.R/R/utils.R +30 -0
  71. data/utils/enveomics/enveomics.R/README.md +1 -0
  72. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
  73. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
  74. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
  75. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
  76. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
  77. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
  78. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
  79. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
  80. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
  81. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
  82. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  83. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
  84. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
  85. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
  86. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
  87. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
  88. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
  89. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
  90. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
  91. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
  92. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
  93. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  94. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
  95. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
  96. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
  97. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
  98. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
  99. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  100. data/utils/multitrim/README.md +67 -0
  101. data/utils/multitrim/multitrim.py +1555 -0
  102. data/utils/multitrim/multitrim.yml +13 -0
  103. data/utils/requirements.txt +4 -3
  104. metadata +33 -6
  105. data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
@@ -1,92 +1,100 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- # @author Luis M. Rodriguez-R
4
- # @license artistic license 2.0
3
+ # frozen_string_literal: true
5
4
 
6
- $:.push File.expand_path("../lib", __FILE__)
7
- require "enveomics_rb/enveomics"
8
- require "enveomics_rb/stat"
5
+ $:.push File.expand_path('../lib', __FILE__)
6
+ require 'enveomics_rb/enveomics'
7
+ require 'enveomics_rb/stats'
8
+ $VERSION = 1.0
9
9
 
10
- o = {q:false, completeness:nil, minlen:500, shuffle:true}
10
+ o = { q: false, completeness: nil, minlen: 500, shuffle: true }
11
11
  OptionParser.new do |opts|
12
- opts.banner = "
13
- Simulates incomplete (fragmented) drafts from complete genomes.
12
+ opts.version = $VERSION
13
+ Enveomics.opt_banner(
14
+ opts, 'Simulates incomplete (fragmented) drafts from complete genomes',
15
+ "#{File.basename($0)} -i in.fasta -o out.fasta -c 0.5 [options]"
16
+ )
14
17
 
15
- Usage: #{$0} [options]"
16
- opts.separator ""
17
- opts.separator "Mandatory"
18
- opts.on("-i", "--in FILE",
19
- "Path to the FastA file containing the complete sequences."
20
- ){ |v| o[:in] = v }
21
- opts.on("-o", "--out FILE", "Path to the FastA to create."){ |v| o[:out] = v }
22
- opts.on("-c", "--completeness FLOAT",
23
- "Fraction of genome completeness to simulate from 0 to 1."
24
- ){ |v| o[:completeness] = v.to_f }
25
- opts.separator ""
26
- opts.separator "Options"
27
- opts.on("-m", "--minlen INT",
28
- "Minimum fragment length to report. By default: #{o[:minlen]}."
29
- ){ |v| o[:minlen] = v.to_i }
30
- opts.on("-s", "--sorted", "Keep fragments sorted as in the input file. ",
31
- "By default, fragments are shuffled."){ |v| o[:shuffle] = !v }
32
- opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:q] = true }
33
- opts.on("-h", "--help", "Display this screen") do
34
- puts opts
35
- exit
36
- end
37
- opts.separator ""
18
+ opts.separator 'Mandatory'
19
+ opts.on(
20
+ '-i', '--in FILE',
21
+ 'Path to the FastA file containing the complete sequences',
22
+ 'Supports compression with .gz extension, use - for STDIN'
23
+ ) { |v| o[:in] = v }
24
+ opts.on(
25
+ '-o', '--out FILE', 'Path to the FastA to create',
26
+ 'Supports compression with .gz extension, use - for STDOUT'
27
+ ) { |v| o[:out] = v }
28
+ opts.on(
29
+ '-c', '--completeness FLOAT',
30
+ 'Fraction of genome completeness to simulate from 0 to 1'
31
+ ) { |v| o[:completeness] = v.to_f }
32
+
33
+ opts.separator ''
34
+ opts.separator 'Options'
35
+ opts.on(
36
+ '-m', '--minlen INT',
37
+ "Minimum fragment length to report. By default: #{o[:minlen]}"
38
+ ) { |v| o[:minlen] = v.to_i }
39
+ opts.on(
40
+ '-s', '--sorted', 'Keep fragments sorted as in the input file',
41
+ 'By default, fragments are shuffled'
42
+ ) { |v| o[:shuffle] = !v }
43
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
44
+ opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
45
+ opts.separator ''
38
46
  end.parse!
39
- abort "-i is mandatory" if o[:in].nil?
40
- abort "-o is mandatory" if o[:out].nil?
41
- abort "-c is mandatory" if o[:completeness].nil?
47
+
48
+ raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
49
+ raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?
50
+ raise Enveomics::OptionError.new('-c is mandatory') if o[:completeness].nil?
42
51
 
43
52
  begin
44
53
  # Read input sequences
45
54
  g_id = []
46
55
  g_seq = []
47
- File.open(o[:in], "r") do |ifh|
48
- id = ""
49
- ifh.each_line do |ln|
50
- if ln =~ /^>(\S*)/
51
- g_id << $1
52
- g_seq << ""
53
- else
54
- g_seq[g_seq.size-1] += ln.gsub(/[^A-Za-z]/,"")
55
- end
56
+ ifh = reader(o[:in])
57
+ id = ''
58
+ ifh.each_line do |ln|
59
+ if ln =~ /^>(\S*)/
60
+ g_id << $1
61
+ g_seq << ''
62
+ else
63
+ g_seq[g_seq.size - 1] += ln.gsub(/[^A-Za-z]/, '')
56
64
  end
57
65
  end
58
-
66
+ ifh.close
67
+
59
68
  # Fragment genomes
60
69
  f = {}
61
70
  binlen = [1, (o[:minlen].to_f/(1.5**2)).ceil].max
62
71
  p = [0.001, [1.0, 1.0 - (o[:completeness]/1.25 + 0.1)].min].max
63
- while not g_seq.empty?
72
+ while !g_seq.empty?
64
73
  id = g_id.shift
65
74
  seq = g_seq.shift
66
75
  gL = seq.length
67
- while not seq.empty?
68
- fL = [0, ((Enve::Stat.r_geom(p).to_f +
69
- Enve::Stat.r_unif(-0.5,0.5))*binlen).round].max
70
- f["#{f.size+1}_#{id}"] = seq[0,fL] if fL >= o[:minlen]
71
- seq = seq[(fL+1) .. -1]
72
- seq = "" if seq.nil?
76
+ while !seq.empty?
77
+ rand_x =
78
+ Enveomics::Stats.r_geom(p).to_f + Enveomics::Stats.r_unif(-0.5, 0.5)
79
+ fL = [0, (rand_x * binlen).round].max
80
+ f["#{f.size+1}_#{id}"] = seq[0, fL] if fL >= o[:minlen]
81
+ seq = seq[(fL + 1) .. -1]
82
+ seq = '' if seq.nil?
73
83
  end
74
84
  end
75
85
 
76
86
  # Save output
77
87
  k = f.keys
78
88
  k.shuffle! if o[:shuffle]
79
- File.open(o[:out], "w") do |ofh|
80
- k.each do |id|
81
- ofh.puts ">#{id}"
82
- ofh.puts f[id].gsub(/(\S{50})/, "\\1\n")
83
- end
89
+ ofh = writer(o[:out])
90
+ k.each do |id|
91
+ ofh.puts ">#{id}"
92
+ ofh.puts f[id].gsub(/(\S{50})/, "\\1\n")
84
93
  end
85
-
94
+ ofh.close
86
95
  rescue => err
87
- $stderr.puts "Exception: #{err}\n\n"
88
- err.backtrace.each { |l| $stderr.puts l + "\n" }
89
- err
96
+ $stderr.puts "Exception: #{err}\n\n"
97
+ err.backtrace.each { |l| $stderr.puts l + "\n" }
98
+ err
90
99
  end
91
100
 
92
-
@@ -1,43 +1,57 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- #
4
- # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
5
- # @license Artistic-2.0
6
- #
3
+ # frozen_string_literal: false
7
4
 
8
- require 'optparse'
5
+ $VERSION = 1.0
6
+ $:.push File.expand_path('../lib', __FILE__)
7
+ require 'enveomics_rb/enveomics'
9
8
 
10
- o = {q: false, rep: false}
11
- ARGV << '-h' if ARGV.size==0
9
+ o = { q: false, rep: false }
12
10
 
13
11
  OptionParser.new do |opt|
14
- opt.banner = "
15
- Samples a random set of sequences from a multi-FastA file.
16
-
17
- Usage: #{$0} [options]"
18
- opt.separator ''
12
+ Enveomics.opt_banner(
13
+ opt, 'Samples a random set of sequences from a multi-FastA file',
14
+ "#{File.basename($0)} -i seq.fa -o 10pc.fa -f 0.1 [options]"
15
+ )
19
16
  opt.separator 'Mandatory'
20
- opt.on('-i', '--in PATH', 'Input FastA file.'){ |v| o[:i] = v }
21
- opt.on('-o', '--out PATH', 'Output FastA file.'){ |v| o[:o] = v }
22
- opt.on('-f', '--fraction FLOAT',
17
+ opt.on(
18
+ '-i', '--in PATH',
19
+ 'Input FastA file',
20
+ 'Supports compression with .gz extension, use - for STDIN'
21
+ ) { |v| o[:i] = v }
22
+ opt.on(
23
+ '-o', '--out PATH',
24
+ 'Output FastA file',
25
+ 'Supports compression with .gz extension, use - for STDOUT'
26
+ ) { |v| o[:o] = v }
27
+ opt.on(
28
+ '-f', '--fraction FLOAT', Float,
23
29
  'Fraction of sequences to sample [0-1].',
24
- 'Mandatory unless -n is provided.'){ |v| o[:f] = v.to_f }
30
+ 'Mandatory unless -c is provided.'
31
+ ) { |v| o[:f] = v }
25
32
  opt.separator ''
33
+
26
34
  opt.separator 'Options'
27
- opt.on('-c', '--number INT',
28
- 'Number of sequences to sample.',
29
- 'Mandatory unless -f is provided.'){ |v| o[:n] = v.to_i }
30
- opt.on('-r', '--replacement','Sample with replacement'){ |v| o[:rep] = v }
31
- opt.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
35
+ opt.on(
36
+ '-c', '--number INT', Integer,
37
+ 'Number of sequences to sample',
38
+ 'Mandatory unless -f is provided'
39
+ ) { |v| o[:n] = v }
40
+ opt.on('-r', '--replacement','Sample with replacement') { |v| o[:rep] = v }
41
+ opt.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
32
42
  opt.on('-h', '--help', 'Display this screen.') do
33
43
  puts opt
34
44
  exit
35
45
  end
36
46
  opt.separator ''
37
47
  end.parse!
38
- abort '-i is mandatory.' if o[:i].nil?
39
- abort '-o is mandatory.' if o[:o].nil?
40
- abort '-f or -n is mandatory.' if o[:f].nil? and o[:n].nil?
48
+
49
+ raise Enveomics::OptionError.new('-i is mandatory') if o[:i].nil?
50
+ raise Enveomics::OptionError.new('-o is mandatory') if o[:o].nil?
51
+ if o[:f].nil? && o[:n].nil?
52
+ raise Enveomics::OptionError.new('-f or -n is mandatory')
53
+ end
54
+ $QUIET = o[:q]
41
55
 
42
56
  # Functions to parse sequences
43
57
  def do_stuff(id, sq)
@@ -53,31 +67,32 @@ def do_stuff(id, sq)
53
67
  end
54
68
 
55
69
  # Parse sequences
56
- $stderr.puts 'Parsing sequences' unless o[:q]
70
+ say 'Parsing sequences'
57
71
  seq = []
58
- File.open(o[:i], 'r') do |fh|
59
- id = nil
60
- sq = ''
61
- fh.each do |ln|
62
- next if ln =~ /^;/
63
- if ln =~ /^>(.+)/
64
- seq << [id, sq] unless id.nil?
65
- id = $1
66
- sq = ''
67
- else
68
- sq << ln
69
- end
72
+ fh = reader(o[:i])
73
+ id = nil
74
+ sq = ''
75
+ fh.each do |ln|
76
+ next if ln =~ /^;/
77
+ if ln =~ /^>(.+)/
78
+ seq << [id, sq] unless id.nil?
79
+ id = $1
80
+ sq = ''
81
+ else
82
+ sq << ln
70
83
  end
71
- seq << [id, sq] unless id.nil?
72
84
  end
73
- $stderr.puts " Input sequences: #{seq.size}"
85
+ seq << [id, sq] unless id.nil?
86
+ fh.close
87
+ say "Input sequences: #{seq.size}"
88
+
74
89
  o[:n] ||= (seq.size * o[:f]).round
75
- seq_o = o[:rep] ? o[:n].times.map{ seq.sample } : seq.sample(o[:n])
76
- File.open(o[:o], 'w') do |fh|
77
- seq_o.each do |i|
78
- fh.puts ">#{i[0]}"
79
- fh.puts i[1]
80
- end
90
+ seq_o = o[:rep] ? o[:n].times.map { seq.sample } : seq.sample(o[:n])
91
+ fh = writer(o[:o])
92
+ seq_o.each do |i|
93
+ fh.puts ">#{i[0]}"
94
+ fh.puts i[1]
81
95
  end
82
- $stderr.puts " Output sequences: #{seq_o.size}"
96
+ fh.close
97
+ say "Output sequences: #{seq_o.size}"
83
98
 
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+ require 'zlib'
5
+
6
+ o = { qual: 31, encoding: 33 }
7
+ ARGV << '-h' if ARGV.empty?
8
+ OptionParser.new do |opts|
9
+ opts.banner = "
10
+ Creates a FastQ-compliant file from a FastA file.
11
+
12
+ Usage: #{$0} [options]"
13
+ opts.separator ''
14
+ opts.separator 'Options'
15
+ opts.on(
16
+ '-i', '--in FILE', 'Input FastA file (supports .gz compression)'
17
+ ) { |v| o[:in] = v }
18
+ opts.on(
19
+ '-o', '--out FILE', 'Output FastQ file (supports .gz compression)'
20
+ ) { |v| o[:out] = v }
21
+ opts.on(
22
+ '-q', '--quality INT', Integer,
23
+ 'PHRED quality score to use (fixed), in the range [-5, 41]',
24
+ "By default: #{o[:qual]}"
25
+ ) { |v| o[:qual] = v }
26
+ opts.on(
27
+ '--encoding INT', Integer,
28
+ "Base encoding (33 or 64). By default: #{o[:encoding]}"
29
+ ) { |v| o[:encoding] = v }
30
+ opts.on('-h', '--help', 'Display this screen.') do
31
+ puts opts
32
+ exit
33
+ end
34
+ opts.separator ''
35
+ end.parse!
36
+ abort '-i is mandatory' if o[:in].nil?
37
+ abort '-o is mandatory' if o[:out].nil?
38
+ abort '-q must be in the range -5 .. 41' if o[:qual] < -5 || o[:qual] > 41
39
+
40
+ # Determine quality character
41
+ $qchar = (o[:qual] + o[:encoding]).chr
42
+
43
+ # Create file handlers
44
+ ifh = o[:in] =~ /\.gz$/ ?
45
+ Zlib::GzipReader.open(o[:in]) : File.open(o[:in], 'r')
46
+ ofh = o[:out] =~ /\.gz$/ ?
47
+ Zlib::GzipWriter.open(o[:out]) : File.open(o[:out], 'w')
48
+
49
+ def print_seq(ofh, id, seq)
50
+ ofh.puts "@#{id}", seq, '+', $qchar * seq.length unless seq.empty?
51
+ end
52
+
53
+ # Generate FastQ
54
+ id = ''
55
+ seq = ''
56
+ ifh.each_line do |ln|
57
+ next if ln =~ /^;/
58
+ if ln =~ /^>(.*)/
59
+ print_seq(ofh, id, seq)
60
+ seq = ''
61
+ id = $1
62
+ else
63
+ seq += ln.chomp.upcase.gsub(/[^A-Z]/,'')
64
+ end
65
+ end
66
+ print_seq(ofh, id, seq)
67
+ ofh.close
68
+ ifh.close
69
+
@@ -0,0 +1,89 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $VERSION = 1.2
4
+ $:.push File.expand_path('../lib', __FILE__)
5
+ require 'enveomics_rb/enveomics'
6
+
7
+ o = { q: false, offset: 33, qual: 15, fasta: false }
8
+ OptionParser.new do |opts|
9
+ opts.version = $VERSION
10
+ Enveomics.opt_banner(
11
+ opts, 'Masks low-quality bases in a FastQ file',
12
+ "#{File.basename($0)} -i in.fastq -o out.fastq [options]"
13
+ )
14
+
15
+ opts.separator 'Mandatory'
16
+ opts.on(
17
+ '-i', '--input FILE',
18
+ 'Path to the FastQ file containing the sequences',
19
+ 'Supports compression with .gz extension, use - for STDIN'
20
+ ) { |v| o[:in] = v }
21
+ opts.on(
22
+ '-o', '--out FILE',
23
+ 'Path to the output FastQ file',
24
+ 'Supports compression with .gz extension, use - for STDOUT'
25
+ ) { |v| o[:out] = v }
26
+
27
+ opts.separator ''
28
+ opts.separator 'Quality Options'
29
+ opts.on(
30
+ '-q', '--qual INT', Integer,
31
+ "Minimum quality score to allow a base, by default: #{o[:qual]}"
32
+ ) { |v| o[:qual] = v }
33
+ opts.on(
34
+ '--offset INT', Integer,
35
+ "Q-score offset, by default: #{o[:offset]}"
36
+ ) { |v| o[:offset] = v }
37
+
38
+ opts.separator ''
39
+ opts.separator 'Other Options'
40
+ opts.on(
41
+ '-a', '--fasta', 'Output sequences in FastA format'
42
+ ) { |v| o[:fasta] = v }
43
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
44
+ opts.on('-h', '--help', 'Display this screen') do
45
+ puts opts
46
+ exit
47
+ end
48
+ opts.separator ''
49
+ end.parse!
50
+
51
+ raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
52
+ raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?
53
+ $QUIET = o[:q]
54
+
55
+ # Open in/out files
56
+ say 'Reading FastQ file'
57
+ ifh = reader(o[:in])
58
+ ofh = writer(o[:out])
59
+
60
+ # Parse and mask
61
+ entry = []
62
+ lno = 0
63
+ ifh.each_line do |ln|
64
+ lno += 1 # <- Gzip doesn't support $.
65
+ case lno % 4
66
+ when 1
67
+ ln =~ /^@(\S+)/ or
68
+ raise Enveomics::ParseError.new("Unexpected defline format: #{ln}")
69
+ entry << ln
70
+ when 2, 3
71
+ entry << ln
72
+ when 0
73
+ entry << ln
74
+ q = entry[3].chomp.split('').map { |i| (i.ord - o[:offset]) }
75
+ q.map { |i| i < o[:qual] }.each_with_index { |i, k| entry[1][k] = 'N' if i }
76
+ ofh.puts(o[:fasta] ? [entry[0].gsub(/^@/, '>'), entry[1]] : entry)
77
+ entry = []
78
+ end
79
+ end
80
+
81
+ # Finalize
82
+ say " Lines: #{lno}"
83
+ unless entry.empty?
84
+ raise Enveomics::ParseError.new('Unexpected trailing lines in FastQ')
85
+ end
86
+ say " Sequences: #{lno / 4}"
87
+ ifh.close
88
+ ofh.close
89
+