miga-base 0.7.26.3 → 1.0.0.sr1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (105) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
  3. data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
  4. data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
  5. data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
  6. data/lib/miga/cli/action/doctor.rb +50 -19
  7. data/lib/miga/cli/action/doctor/base.rb +20 -18
  8. data/lib/miga/cli/action/init.rb +11 -7
  9. data/lib/miga/cli/action/init/files_helper.rb +1 -0
  10. data/lib/miga/cli/action/ncbi_get.rb +3 -3
  11. data/lib/miga/cli/action/tax_dist.rb +2 -2
  12. data/lib/miga/cli/action/wf.rb +5 -4
  13. data/lib/miga/daemon.rb +11 -4
  14. data/lib/miga/dataset/result.rb +10 -6
  15. data/lib/miga/json.rb +1 -2
  16. data/lib/miga/metadata.rb +5 -1
  17. data/lib/miga/parallel.rb +11 -6
  18. data/lib/miga/project.rb +8 -8
  19. data/lib/miga/project/base.rb +4 -4
  20. data/lib/miga/project/result.rb +2 -2
  21. data/lib/miga/sqlite.rb +7 -0
  22. data/lib/miga/version.rb +23 -9
  23. data/scripts/aai_distances.bash +16 -18
  24. data/scripts/ani_distances.bash +16 -17
  25. data/scripts/assembly.bash +31 -16
  26. data/scripts/haai_distances.bash +3 -27
  27. data/scripts/miga.bash +6 -4
  28. data/scripts/p.bash +1 -1
  29. data/scripts/read_quality.bash +9 -18
  30. data/scripts/trimmed_fasta.bash +14 -30
  31. data/scripts/trimmed_reads.bash +36 -36
  32. data/test/parallel_test.rb +31 -0
  33. data/test/project_test.rb +2 -1
  34. data/utils/distance/commands.rb +1 -0
  35. data/utils/distance/runner.rb +2 -4
  36. data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
  37. data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
  38. data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
  39. data/utils/enveomics/Manifest/Tasks/other.json +77 -0
  40. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
  41. data/utils/enveomics/Manifest/categories.json +13 -4
  42. data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
  43. data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
  44. data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
  45. data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
  46. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  47. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  48. data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
  49. data/utils/enveomics/Scripts/SRA.download.bash +6 -8
  50. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  51. data/utils/enveomics/Scripts/aai.rb +3 -2
  52. data/utils/enveomics/Scripts/anir.rb +137 -0
  53. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  54. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  55. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
  56. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  57. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  58. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
  59. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  60. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  61. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  62. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  63. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
  64. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  65. data/utils/enveomics/Scripts/rbm.rb +87 -133
  66. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  67. data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
  68. data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
  69. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  70. data/utils/enveomics/enveomics.R/R/utils.R +30 -0
  71. data/utils/enveomics/enveomics.R/README.md +1 -0
  72. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
  73. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
  74. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
  75. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
  76. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
  77. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
  78. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
  79. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
  80. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
  81. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
  82. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  83. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
  84. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
  85. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
  86. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
  87. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
  88. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
  89. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
  90. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
  91. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
  92. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
  93. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  94. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
  95. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
  96. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
  97. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
  98. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
  99. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  100. data/utils/multitrim/README.md +67 -0
  101. data/utils/multitrim/multitrim.py +1555 -0
  102. data/utils/multitrim/multitrim.yml +13 -0
  103. data/utils/requirements.txt +4 -3
  104. metadata +33 -6
  105. data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
@@ -1,92 +1,100 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- # @author Luis M. Rodriguez-R
4
- # @license artistic license 2.0
3
+ # frozen_string_literal: true
5
4
 
6
- $:.push File.expand_path("../lib", __FILE__)
7
- require "enveomics_rb/enveomics"
8
- require "enveomics_rb/stat"
5
+ $:.push File.expand_path('../lib', __FILE__)
6
+ require 'enveomics_rb/enveomics'
7
+ require 'enveomics_rb/stats'
8
+ $VERSION = 1.0
9
9
 
10
- o = {q:false, completeness:nil, minlen:500, shuffle:true}
10
+ o = { q: false, completeness: nil, minlen: 500, shuffle: true }
11
11
  OptionParser.new do |opts|
12
- opts.banner = "
13
- Simulates incomplete (fragmented) drafts from complete genomes.
12
+ opts.version = $VERSION
13
+ Enveomics.opt_banner(
14
+ opts, 'Simulates incomplete (fragmented) drafts from complete genomes',
15
+ "#{File.basename($0)} -i in.fasta -o out.fasta -c 0.5 [options]"
16
+ )
14
17
 
15
- Usage: #{$0} [options]"
16
- opts.separator ""
17
- opts.separator "Mandatory"
18
- opts.on("-i", "--in FILE",
19
- "Path to the FastA file containing the complete sequences."
20
- ){ |v| o[:in] = v }
21
- opts.on("-o", "--out FILE", "Path to the FastA to create."){ |v| o[:out] = v }
22
- opts.on("-c", "--completeness FLOAT",
23
- "Fraction of genome completeness to simulate from 0 to 1."
24
- ){ |v| o[:completeness] = v.to_f }
25
- opts.separator ""
26
- opts.separator "Options"
27
- opts.on("-m", "--minlen INT",
28
- "Minimum fragment length to report. By default: #{o[:minlen]}."
29
- ){ |v| o[:minlen] = v.to_i }
30
- opts.on("-s", "--sorted", "Keep fragments sorted as in the input file. ",
31
- "By default, fragments are shuffled."){ |v| o[:shuffle] = !v }
32
- opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:q] = true }
33
- opts.on("-h", "--help", "Display this screen") do
34
- puts opts
35
- exit
36
- end
37
- opts.separator ""
18
+ opts.separator 'Mandatory'
19
+ opts.on(
20
+ '-i', '--in FILE',
21
+ 'Path to the FastA file containing the complete sequences',
22
+ 'Supports compression with .gz extension, use - for STDIN'
23
+ ) { |v| o[:in] = v }
24
+ opts.on(
25
+ '-o', '--out FILE', 'Path to the FastA to create',
26
+ 'Supports compression with .gz extension, use - for STDOUT'
27
+ ) { |v| o[:out] = v }
28
+ opts.on(
29
+ '-c', '--completeness FLOAT',
30
+ 'Fraction of genome completeness to simulate from 0 to 1'
31
+ ) { |v| o[:completeness] = v.to_f }
32
+
33
+ opts.separator ''
34
+ opts.separator 'Options'
35
+ opts.on(
36
+ '-m', '--minlen INT',
37
+ "Minimum fragment length to report. By default: #{o[:minlen]}"
38
+ ) { |v| o[:minlen] = v.to_i }
39
+ opts.on(
40
+ '-s', '--sorted', 'Keep fragments sorted as in the input file',
41
+ 'By default, fragments are shuffled'
42
+ ) { |v| o[:shuffle] = !v }
43
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
44
+ opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
45
+ opts.separator ''
38
46
  end.parse!
39
- abort "-i is mandatory" if o[:in].nil?
40
- abort "-o is mandatory" if o[:out].nil?
41
- abort "-c is mandatory" if o[:completeness].nil?
47
+
48
+ raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
49
+ raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?
50
+ raise Enveomics::OptionError.new('-c is mandatory') if o[:completeness].nil?
42
51
 
43
52
  begin
44
53
  # Read input sequences
45
54
  g_id = []
46
55
  g_seq = []
47
- File.open(o[:in], "r") do |ifh|
48
- id = ""
49
- ifh.each_line do |ln|
50
- if ln =~ /^>(\S*)/
51
- g_id << $1
52
- g_seq << ""
53
- else
54
- g_seq[g_seq.size-1] += ln.gsub(/[^A-Za-z]/,"")
55
- end
56
+ ifh = reader(o[:in])
57
+ id = ''
58
+ ifh.each_line do |ln|
59
+ if ln =~ /^>(\S*)/
60
+ g_id << $1
61
+ g_seq << ''
62
+ else
63
+ g_seq[g_seq.size - 1] += ln.gsub(/[^A-Za-z]/, '')
56
64
  end
57
65
  end
58
-
66
+ ifh.close
67
+
59
68
  # Fragment genomes
60
69
  f = {}
61
70
  binlen = [1, (o[:minlen].to_f/(1.5**2)).ceil].max
62
71
  p = [0.001, [1.0, 1.0 - (o[:completeness]/1.25 + 0.1)].min].max
63
- while not g_seq.empty?
72
+ while !g_seq.empty?
64
73
  id = g_id.shift
65
74
  seq = g_seq.shift
66
75
  gL = seq.length
67
- while not seq.empty?
68
- fL = [0, ((Enve::Stat.r_geom(p).to_f +
69
- Enve::Stat.r_unif(-0.5,0.5))*binlen).round].max
70
- f["#{f.size+1}_#{id}"] = seq[0,fL] if fL >= o[:minlen]
71
- seq = seq[(fL+1) .. -1]
72
- seq = "" if seq.nil?
76
+ while !seq.empty?
77
+ rand_x =
78
+ Enveomics::Stats.r_geom(p).to_f + Enveomics::Stats.r_unif(-0.5, 0.5)
79
+ fL = [0, (rand_x * binlen).round].max
80
+ f["#{f.size+1}_#{id}"] = seq[0, fL] if fL >= o[:minlen]
81
+ seq = seq[(fL + 1) .. -1]
82
+ seq = '' if seq.nil?
73
83
  end
74
84
  end
75
85
 
76
86
  # Save output
77
87
  k = f.keys
78
88
  k.shuffle! if o[:shuffle]
79
- File.open(o[:out], "w") do |ofh|
80
- k.each do |id|
81
- ofh.puts ">#{id}"
82
- ofh.puts f[id].gsub(/(\S{50})/, "\\1\n")
83
- end
89
+ ofh = writer(o[:out])
90
+ k.each do |id|
91
+ ofh.puts ">#{id}"
92
+ ofh.puts f[id].gsub(/(\S{50})/, "\\1\n")
84
93
  end
85
-
94
+ ofh.close
86
95
  rescue => err
87
- $stderr.puts "Exception: #{err}\n\n"
88
- err.backtrace.each { |l| $stderr.puts l + "\n" }
89
- err
96
+ $stderr.puts "Exception: #{err}\n\n"
97
+ err.backtrace.each { |l| $stderr.puts l + "\n" }
98
+ err
90
99
  end
91
100
 
92
-
@@ -1,43 +1,57 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- #
4
- # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
5
- # @license Artistic-2.0
6
- #
3
+ # frozen_string_literal: false
7
4
 
8
- require 'optparse'
5
+ $VERSION = 1.0
6
+ $:.push File.expand_path('../lib', __FILE__)
7
+ require 'enveomics_rb/enveomics'
9
8
 
10
- o = {q: false, rep: false}
11
- ARGV << '-h' if ARGV.size==0
9
+ o = { q: false, rep: false }
12
10
 
13
11
  OptionParser.new do |opt|
14
- opt.banner = "
15
- Samples a random set of sequences from a multi-FastA file.
16
-
17
- Usage: #{$0} [options]"
18
- opt.separator ''
12
+ Enveomics.opt_banner(
13
+ opt, 'Samples a random set of sequences from a multi-FastA file',
14
+ "#{File.basename($0)} -i seq.fa -o 10pc.fa -f 0.1 [options]"
15
+ )
19
16
  opt.separator 'Mandatory'
20
- opt.on('-i', '--in PATH', 'Input FastA file.'){ |v| o[:i] = v }
21
- opt.on('-o', '--out PATH', 'Output FastA file.'){ |v| o[:o] = v }
22
- opt.on('-f', '--fraction FLOAT',
17
+ opt.on(
18
+ '-i', '--in PATH',
19
+ 'Input FastA file',
20
+ 'Supports compression with .gz extension, use - for STDIN'
21
+ ) { |v| o[:i] = v }
22
+ opt.on(
23
+ '-o', '--out PATH',
24
+ 'Output FastA file',
25
+ 'Supports compression with .gz extension, use - for STDOUT'
26
+ ) { |v| o[:o] = v }
27
+ opt.on(
28
+ '-f', '--fraction FLOAT', Float,
23
29
  'Fraction of sequences to sample [0-1].',
24
- 'Mandatory unless -n is provided.'){ |v| o[:f] = v.to_f }
30
+ 'Mandatory unless -c is provided.'
31
+ ) { |v| o[:f] = v }
25
32
  opt.separator ''
33
+
26
34
  opt.separator 'Options'
27
- opt.on('-c', '--number INT',
28
- 'Number of sequences to sample.',
29
- 'Mandatory unless -f is provided.'){ |v| o[:n] = v.to_i }
30
- opt.on('-r', '--replacement','Sample with replacement'){ |v| o[:rep] = v }
31
- opt.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
35
+ opt.on(
36
+ '-c', '--number INT', Integer,
37
+ 'Number of sequences to sample',
38
+ 'Mandatory unless -f is provided'
39
+ ) { |v| o[:n] = v }
40
+ opt.on('-r', '--replacement','Sample with replacement') { |v| o[:rep] = v }
41
+ opt.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
32
42
  opt.on('-h', '--help', 'Display this screen.') do
33
43
  puts opt
34
44
  exit
35
45
  end
36
46
  opt.separator ''
37
47
  end.parse!
38
- abort '-i is mandatory.' if o[:i].nil?
39
- abort '-o is mandatory.' if o[:o].nil?
40
- abort '-f or -n is mandatory.' if o[:f].nil? and o[:n].nil?
48
+
49
+ raise Enveomics::OptionError.new('-i is mandatory') if o[:i].nil?
50
+ raise Enveomics::OptionError.new('-o is mandatory') if o[:o].nil?
51
+ if o[:f].nil? && o[:n].nil?
52
+ raise Enveomics::OptionError.new('-f or -n is mandatory')
53
+ end
54
+ $QUIET = o[:q]
41
55
 
42
56
  # Functions to parse sequences
43
57
  def do_stuff(id, sq)
@@ -53,31 +67,32 @@ def do_stuff(id, sq)
53
67
  end
54
68
 
55
69
  # Parse sequences
56
- $stderr.puts 'Parsing sequences' unless o[:q]
70
+ say 'Parsing sequences'
57
71
  seq = []
58
- File.open(o[:i], 'r') do |fh|
59
- id = nil
60
- sq = ''
61
- fh.each do |ln|
62
- next if ln =~ /^;/
63
- if ln =~ /^>(.+)/
64
- seq << [id, sq] unless id.nil?
65
- id = $1
66
- sq = ''
67
- else
68
- sq << ln
69
- end
72
+ fh = reader(o[:i])
73
+ id = nil
74
+ sq = ''
75
+ fh.each do |ln|
76
+ next if ln =~ /^;/
77
+ if ln =~ /^>(.+)/
78
+ seq << [id, sq] unless id.nil?
79
+ id = $1
80
+ sq = ''
81
+ else
82
+ sq << ln
70
83
  end
71
- seq << [id, sq] unless id.nil?
72
84
  end
73
- $stderr.puts " Input sequences: #{seq.size}"
85
+ seq << [id, sq] unless id.nil?
86
+ fh.close
87
+ say "Input sequences: #{seq.size}"
88
+
74
89
  o[:n] ||= (seq.size * o[:f]).round
75
- seq_o = o[:rep] ? o[:n].times.map{ seq.sample } : seq.sample(o[:n])
76
- File.open(o[:o], 'w') do |fh|
77
- seq_o.each do |i|
78
- fh.puts ">#{i[0]}"
79
- fh.puts i[1]
80
- end
90
+ seq_o = o[:rep] ? o[:n].times.map { seq.sample } : seq.sample(o[:n])
91
+ fh = writer(o[:o])
92
+ seq_o.each do |i|
93
+ fh.puts ">#{i[0]}"
94
+ fh.puts i[1]
81
95
  end
82
- $stderr.puts " Output sequences: #{seq_o.size}"
96
+ fh.close
97
+ say "Output sequences: #{seq_o.size}"
83
98
 
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+ require 'zlib'
5
+
6
+ o = { qual: 31, encoding: 33 }
7
+ ARGV << '-h' if ARGV.empty?
8
+ OptionParser.new do |opts|
9
+ opts.banner = "
10
+ Creates a FastQ-compliant file from a FastA file.
11
+
12
+ Usage: #{$0} [options]"
13
+ opts.separator ''
14
+ opts.separator 'Options'
15
+ opts.on(
16
+ '-i', '--in FILE', 'Input FastA file (supports .gz compression)'
17
+ ) { |v| o[:in] = v }
18
+ opts.on(
19
+ '-o', '--out FILE', 'Output FastQ file (supports .gz compression)'
20
+ ) { |v| o[:out] = v }
21
+ opts.on(
22
+ '-q', '--quality INT', Integer,
23
+ 'PHRED quality score to use (fixed), in the range [-5, 41]',
24
+ "By default: #{o[:qual]}"
25
+ ) { |v| o[:qual] = v }
26
+ opts.on(
27
+ '--encoding INT', Integer,
28
+ "Base encoding (33 or 64). By default: #{o[:encoding]}"
29
+ ) { |v| o[:encoding] = v }
30
+ opts.on('-h', '--help', 'Display this screen.') do
31
+ puts opts
32
+ exit
33
+ end
34
+ opts.separator ''
35
+ end.parse!
36
+ abort '-i is mandatory' if o[:in].nil?
37
+ abort '-o is mandatory' if o[:out].nil?
38
+ abort '-q must be in the range -5 .. 41' if o[:qual] < -5 || o[:qual] > 41
39
+
40
+ # Determine quality character
41
+ $qchar = (o[:qual] + o[:encoding]).chr
42
+
43
+ # Create file handlers
44
+ ifh = o[:in] =~ /\.gz$/ ?
45
+ Zlib::GzipReader.open(o[:in]) : File.open(o[:in], 'r')
46
+ ofh = o[:out] =~ /\.gz$/ ?
47
+ Zlib::GzipWriter.open(o[:out]) : File.open(o[:out], 'w')
48
+
49
+ def print_seq(ofh, id, seq)
50
+ ofh.puts "@#{id}", seq, '+', $qchar * seq.length unless seq.empty?
51
+ end
52
+
53
+ # Generate FastQ
54
+ id = ''
55
+ seq = ''
56
+ ifh.each_line do |ln|
57
+ next if ln =~ /^;/
58
+ if ln =~ /^>(.*)/
59
+ print_seq(ofh, id, seq)
60
+ seq = ''
61
+ id = $1
62
+ else
63
+ seq += ln.chomp.upcase.gsub(/[^A-Z]/,'')
64
+ end
65
+ end
66
+ print_seq(ofh, id, seq)
67
+ ofh.close
68
+ ifh.close
69
+
@@ -0,0 +1,89 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $VERSION = 1.2
4
+ $:.push File.expand_path('../lib', __FILE__)
5
+ require 'enveomics_rb/enveomics'
6
+
7
+ o = { q: false, offset: 33, qual: 15, fasta: false }
8
+ OptionParser.new do |opts|
9
+ opts.version = $VERSION
10
+ Enveomics.opt_banner(
11
+ opts, 'Masks low-quality bases in a FastQ file',
12
+ "#{File.basename($0)} -i in.fastq -o out.fastq [options]"
13
+ )
14
+
15
+ opts.separator 'Mandatory'
16
+ opts.on(
17
+ '-i', '--input FILE',
18
+ 'Path to the FastQ file containing the sequences',
19
+ 'Supports compression with .gz extension, use - for STDIN'
20
+ ) { |v| o[:in] = v }
21
+ opts.on(
22
+ '-o', '--out FILE',
23
+ 'Path to the output FastQ file',
24
+ 'Supports compression with .gz extension, use - for STDOUT'
25
+ ) { |v| o[:out] = v }
26
+
27
+ opts.separator ''
28
+ opts.separator 'Quality Options'
29
+ opts.on(
30
+ '-q', '--qual INT', Integer,
31
+ "Minimum quality score to allow a base, by default: #{o[:qual]}"
32
+ ) { |v| o[:qual] = v }
33
+ opts.on(
34
+ '--offset INT', Integer,
35
+ "Q-score offset, by default: #{o[:offset]}"
36
+ ) { |v| o[:offset] = v }
37
+
38
+ opts.separator ''
39
+ opts.separator 'Other Options'
40
+ opts.on(
41
+ '-a', '--fasta', 'Output sequences in FastA format'
42
+ ) { |v| o[:fasta] = v }
43
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
44
+ opts.on('-h', '--help', 'Display this screen') do
45
+ puts opts
46
+ exit
47
+ end
48
+ opts.separator ''
49
+ end.parse!
50
+
51
+ raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
52
+ raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?
53
+ $QUIET = o[:q]
54
+
55
+ # Open in/out files
56
+ say 'Reading FastQ file'
57
+ ifh = reader(o[:in])
58
+ ofh = writer(o[:out])
59
+
60
+ # Parse and mask
61
+ entry = []
62
+ lno = 0
63
+ ifh.each_line do |ln|
64
+ lno += 1 # <- Gzip doesn't support $.
65
+ case lno % 4
66
+ when 1
67
+ ln =~ /^@(\S+)/ or
68
+ raise Enveomics::ParseError.new("Unexpected defline format: #{ln}")
69
+ entry << ln
70
+ when 2, 3
71
+ entry << ln
72
+ when 0
73
+ entry << ln
74
+ q = entry[3].chomp.split('').map { |i| (i.ord - o[:offset]) }
75
+ q.map { |i| i < o[:qual] }.each_with_index { |i, k| entry[1][k] = 'N' if i }
76
+ ofh.puts(o[:fasta] ? [entry[0].gsub(/^@/, '>'), entry[1]] : entry)
77
+ entry = []
78
+ end
79
+ end
80
+
81
+ # Finalize
82
+ say " Lines: #{lno}"
83
+ unless entry.empty?
84
+ raise Enveomics::ParseError.new('Unexpected trailing lines in FastQ')
85
+ end
86
+ say " Sequences: #{lno / 4}"
87
+ ifh.close
88
+ ofh.close
89
+