miga-base 0.7.26.3 → 1.0.0.sr1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
- data/lib/miga/cli/action/doctor.rb +50 -19
- data/lib/miga/cli/action/doctor/base.rb +20 -18
- data/lib/miga/cli/action/init.rb +11 -7
- data/lib/miga/cli/action/init/files_helper.rb +1 -0
- data/lib/miga/cli/action/ncbi_get.rb +3 -3
- data/lib/miga/cli/action/tax_dist.rb +2 -2
- data/lib/miga/cli/action/wf.rb +5 -4
- data/lib/miga/daemon.rb +11 -4
- data/lib/miga/dataset/result.rb +10 -6
- data/lib/miga/json.rb +1 -2
- data/lib/miga/metadata.rb +5 -1
- data/lib/miga/parallel.rb +11 -6
- data/lib/miga/project.rb +8 -8
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -2
- data/lib/miga/sqlite.rb +7 -0
- data/lib/miga/version.rb +23 -9
- data/scripts/aai_distances.bash +16 -18
- data/scripts/ani_distances.bash +16 -17
- data/scripts/assembly.bash +31 -16
- data/scripts/haai_distances.bash +3 -27
- data/scripts/miga.bash +6 -4
- data/scripts/p.bash +1 -1
- data/scripts/read_quality.bash +9 -18
- data/scripts/trimmed_fasta.bash +14 -30
- data/scripts/trimmed_reads.bash +36 -36
- data/test/parallel_test.rb +31 -0
- data/test/project_test.rb +2 -1
- data/utils/distance/commands.rb +1 -0
- data/utils/distance/runner.rb +2 -4
- data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
- data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
- data/utils/enveomics/Manifest/Tasks/other.json +77 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
- data/utils/enveomics/Manifest/categories.json +13 -4
- data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
- data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
- data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
- data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
- data/utils/enveomics/Scripts/SRA.download.bash +6 -8
- data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
- data/utils/enveomics/Scripts/aai.rb +3 -2
- data/utils/enveomics/Scripts/anir.rb +137 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
- data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
- data/utils/enveomics/Scripts/rbm.rb +87 -133
- data/utils/enveomics/Scripts/sam.filter.rb +148 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
- data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
- data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
- data/utils/enveomics/enveomics.R/R/utils.R +30 -0
- data/utils/enveomics/enveomics.R/README.md +1 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +67 -0
- data/utils/multitrim/multitrim.py +1555 -0
- data/utils/multitrim/multitrim.yml +13 -0
- data/utils/requirements.txt +4 -3
- metadata +33 -6
- data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
@@ -1,92 +1,100 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
#
|
4
|
-
# @license artistic license 2.0
|
3
|
+
# frozen_string_literal: true
|
5
4
|
|
6
|
-
$:.push File.expand_path(
|
7
|
-
require
|
8
|
-
require
|
5
|
+
$:.push File.expand_path('../lib', __FILE__)
|
6
|
+
require 'enveomics_rb/enveomics'
|
7
|
+
require 'enveomics_rb/stats'
|
8
|
+
$VERSION = 1.0
|
9
9
|
|
10
|
-
o = {q:false, completeness:nil, minlen:500, shuffle:true}
|
10
|
+
o = { q: false, completeness: nil, minlen: 500, shuffle: true }
|
11
11
|
OptionParser.new do |opts|
|
12
|
-
opts.
|
13
|
-
|
12
|
+
opts.version = $VERSION
|
13
|
+
Enveomics.opt_banner(
|
14
|
+
opts, 'Simulates incomplete (fragmented) drafts from complete genomes',
|
15
|
+
"#{File.basename($0)} -i in.fasta -o out.fasta -c 0.5 [options]"
|
16
|
+
)
|
14
17
|
|
15
|
-
|
16
|
-
opts.
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
opts.on(
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
opts.
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
opts.
|
31
|
-
|
32
|
-
opts.on(
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
18
|
+
opts.separator 'Mandatory'
|
19
|
+
opts.on(
|
20
|
+
'-i', '--in FILE',
|
21
|
+
'Path to the FastA file containing the complete sequences',
|
22
|
+
'Supports compression with .gz extension, use - for STDIN'
|
23
|
+
) { |v| o[:in] = v }
|
24
|
+
opts.on(
|
25
|
+
'-o', '--out FILE', 'Path to the FastA to create',
|
26
|
+
'Supports compression with .gz extension, use - for STDOUT'
|
27
|
+
) { |v| o[:out] = v }
|
28
|
+
opts.on(
|
29
|
+
'-c', '--completeness FLOAT',
|
30
|
+
'Fraction of genome completeness to simulate from 0 to 1'
|
31
|
+
) { |v| o[:completeness] = v.to_f }
|
32
|
+
|
33
|
+
opts.separator ''
|
34
|
+
opts.separator 'Options'
|
35
|
+
opts.on(
|
36
|
+
'-m', '--minlen INT',
|
37
|
+
"Minimum fragment length to report. By default: #{o[:minlen]}"
|
38
|
+
) { |v| o[:minlen] = v.to_i }
|
39
|
+
opts.on(
|
40
|
+
'-s', '--sorted', 'Keep fragments sorted as in the input file',
|
41
|
+
'By default, fragments are shuffled'
|
42
|
+
) { |v| o[:shuffle] = !v }
|
43
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
|
44
|
+
opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
|
45
|
+
opts.separator ''
|
38
46
|
end.parse!
|
39
|
-
|
40
|
-
|
41
|
-
|
47
|
+
|
48
|
+
raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
|
49
|
+
raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?
|
50
|
+
raise Enveomics::OptionError.new('-c is mandatory') if o[:completeness].nil?
|
42
51
|
|
43
52
|
begin
|
44
53
|
# Read input sequences
|
45
54
|
g_id = []
|
46
55
|
g_seq = []
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
end
|
56
|
+
ifh = reader(o[:in])
|
57
|
+
id = ''
|
58
|
+
ifh.each_line do |ln|
|
59
|
+
if ln =~ /^>(\S*)/
|
60
|
+
g_id << $1
|
61
|
+
g_seq << ''
|
62
|
+
else
|
63
|
+
g_seq[g_seq.size - 1] += ln.gsub(/[^A-Za-z]/, '')
|
56
64
|
end
|
57
65
|
end
|
58
|
-
|
66
|
+
ifh.close
|
67
|
+
|
59
68
|
# Fragment genomes
|
60
69
|
f = {}
|
61
70
|
binlen = [1, (o[:minlen].to_f/(1.5**2)).ceil].max
|
62
71
|
p = [0.001, [1.0, 1.0 - (o[:completeness]/1.25 + 0.1)].min].max
|
63
|
-
while
|
72
|
+
while !g_seq.empty?
|
64
73
|
id = g_id.shift
|
65
74
|
seq = g_seq.shift
|
66
75
|
gL = seq.length
|
67
|
-
while
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
seq =
|
76
|
+
while !seq.empty?
|
77
|
+
rand_x =
|
78
|
+
Enveomics::Stats.r_geom(p).to_f + Enveomics::Stats.r_unif(-0.5, 0.5)
|
79
|
+
fL = [0, (rand_x * binlen).round].max
|
80
|
+
f["#{f.size+1}_#{id}"] = seq[0, fL] if fL >= o[:minlen]
|
81
|
+
seq = seq[(fL + 1) .. -1]
|
82
|
+
seq = '' if seq.nil?
|
73
83
|
end
|
74
84
|
end
|
75
85
|
|
76
86
|
# Save output
|
77
87
|
k = f.keys
|
78
88
|
k.shuffle! if o[:shuffle]
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
end
|
89
|
+
ofh = writer(o[:out])
|
90
|
+
k.each do |id|
|
91
|
+
ofh.puts ">#{id}"
|
92
|
+
ofh.puts f[id].gsub(/(\S{50})/, "\\1\n")
|
84
93
|
end
|
85
|
-
|
94
|
+
ofh.close
|
86
95
|
rescue => err
|
87
|
-
|
88
|
-
|
89
|
-
|
96
|
+
$stderr.puts "Exception: #{err}\n\n"
|
97
|
+
err.backtrace.each { |l| $stderr.puts l + "\n" }
|
98
|
+
err
|
90
99
|
end
|
91
100
|
|
92
|
-
|
@@ -1,43 +1,57 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
#
|
4
|
-
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
5
|
-
# @license Artistic-2.0
|
6
|
-
#
|
3
|
+
# frozen_string_literal: false
|
7
4
|
|
8
|
-
|
5
|
+
$VERSION = 1.0
|
6
|
+
$:.push File.expand_path('../lib', __FILE__)
|
7
|
+
require 'enveomics_rb/enveomics'
|
9
8
|
|
10
|
-
o = {q: false, rep: false}
|
11
|
-
ARGV << '-h' if ARGV.size==0
|
9
|
+
o = { q: false, rep: false }
|
12
10
|
|
13
11
|
OptionParser.new do |opt|
|
14
|
-
|
15
|
-
Samples a random set of sequences from a multi-FastA file
|
16
|
-
|
17
|
-
|
18
|
-
opt.separator ''
|
12
|
+
Enveomics.opt_banner(
|
13
|
+
opt, 'Samples a random set of sequences from a multi-FastA file',
|
14
|
+
"#{File.basename($0)} -i seq.fa -o 10pc.fa -f 0.1 [options]"
|
15
|
+
)
|
19
16
|
opt.separator 'Mandatory'
|
20
|
-
opt.on(
|
21
|
-
|
22
|
-
|
17
|
+
opt.on(
|
18
|
+
'-i', '--in PATH',
|
19
|
+
'Input FastA file',
|
20
|
+
'Supports compression with .gz extension, use - for STDIN'
|
21
|
+
) { |v| o[:i] = v }
|
22
|
+
opt.on(
|
23
|
+
'-o', '--out PATH',
|
24
|
+
'Output FastA file',
|
25
|
+
'Supports compression with .gz extension, use - for STDOUT'
|
26
|
+
) { |v| o[:o] = v }
|
27
|
+
opt.on(
|
28
|
+
'-f', '--fraction FLOAT', Float,
|
23
29
|
'Fraction of sequences to sample [0-1].',
|
24
|
-
'Mandatory unless -
|
30
|
+
'Mandatory unless -c is provided.'
|
31
|
+
) { |v| o[:f] = v }
|
25
32
|
opt.separator ''
|
33
|
+
|
26
34
|
opt.separator 'Options'
|
27
|
-
opt.on(
|
28
|
-
'
|
29
|
-
'
|
30
|
-
|
31
|
-
|
35
|
+
opt.on(
|
36
|
+
'-c', '--number INT', Integer,
|
37
|
+
'Number of sequences to sample',
|
38
|
+
'Mandatory unless -f is provided'
|
39
|
+
) { |v| o[:n] = v }
|
40
|
+
opt.on('-r', '--replacement','Sample with replacement') { |v| o[:rep] = v }
|
41
|
+
opt.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
|
32
42
|
opt.on('-h', '--help', 'Display this screen.') do
|
33
43
|
puts opt
|
34
44
|
exit
|
35
45
|
end
|
36
46
|
opt.separator ''
|
37
47
|
end.parse!
|
38
|
-
|
39
|
-
|
40
|
-
|
48
|
+
|
49
|
+
raise Enveomics::OptionError.new('-i is mandatory') if o[:i].nil?
|
50
|
+
raise Enveomics::OptionError.new('-o is mandatory') if o[:o].nil?
|
51
|
+
if o[:f].nil? && o[:n].nil?
|
52
|
+
# The count option is declared as -c/--number, so the message must name -c.
raise Enveomics::OptionError.new('-f or -c is mandatory')
|
53
|
+
end
|
54
|
+
$QUIET = o[:q]
|
41
55
|
|
42
56
|
# Functions to parse sequences
|
43
57
|
def do_stuff(id, sq)
|
@@ -53,31 +67,32 @@ def do_stuff(id, sq)
|
|
53
67
|
end
|
54
68
|
|
55
69
|
# Parse sequences
|
56
|
-
|
70
|
+
say 'Parsing sequences'
|
57
71
|
seq = []
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
end
|
72
|
+
fh = reader(o[:i])
|
73
|
+
id = nil
|
74
|
+
sq = ''
|
75
|
+
fh.each do |ln|
|
76
|
+
next if ln =~ /^;/
|
77
|
+
if ln =~ /^>(.+)/
|
78
|
+
seq << [id, sq] unless id.nil?
|
79
|
+
id = $1
|
80
|
+
sq = ''
|
81
|
+
else
|
82
|
+
sq << ln
|
70
83
|
end
|
71
|
-
seq << [id, sq] unless id.nil?
|
72
84
|
end
|
73
|
-
|
85
|
+
seq << [id, sq] unless id.nil?
|
86
|
+
fh.close
|
87
|
+
say "Input sequences: #{seq.size}"
|
88
|
+
|
74
89
|
o[:n] ||= (seq.size * o[:f]).round
|
75
|
-
seq_o = o[:rep] ? o[:n].times.map{ seq.sample } : seq.sample(o[:n])
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
end
|
90
|
+
seq_o = o[:rep] ? o[:n].times.map { seq.sample } : seq.sample(o[:n])
|
91
|
+
fh = writer(o[:o])
|
92
|
+
seq_o.each do |i|
|
93
|
+
fh.puts ">#{i[0]}"
|
94
|
+
fh.puts i[1]
|
81
95
|
end
|
82
|
-
|
96
|
+
fh.close
|
97
|
+
say "Output sequences: #{seq_o.size}"
|
83
98
|
|
@@ -0,0 +1,69 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
require 'zlib'
|
5
|
+
|
6
|
+
o = { qual: 31, encoding: 33 }
|
7
|
+
ARGV << '-h' if ARGV.empty?
|
8
|
+
OptionParser.new do |opts|
|
9
|
+
opts.banner = "
|
10
|
+
Creates a FastQ-compliant file from a FastA file.
|
11
|
+
|
12
|
+
Usage: #{$0} [options]"
|
13
|
+
opts.separator ''
|
14
|
+
opts.separator 'Options'
|
15
|
+
opts.on(
|
16
|
+
'-i', '--in FILE', 'Input FastA file (supports .gz compression)'
|
17
|
+
) { |v| o[:in] = v }
|
18
|
+
opts.on(
|
19
|
+
'-o', '--out FILE', 'Output FastQ file (supports .gz compression)'
|
20
|
+
) { |v| o[:out] = v }
|
21
|
+
opts.on(
|
22
|
+
'-q', '--quality INT', Integer,
|
23
|
+
'PHRED quality score to use (fixed), in the range [-5, 41]',
|
24
|
+
"By default: #{o[:qual]}"
|
25
|
+
) { |v| o[:qual] = v }
|
26
|
+
opts.on(
|
27
|
+
'--encoding INT', Integer,
|
28
|
+
"Base encoding (33 or 64). By default: #{o[:encoding]}"
|
29
|
+
) { |v| o[:encoding] = v }
|
30
|
+
opts.on('-h', '--help', 'Display this screen.') do
|
31
|
+
puts opts
|
32
|
+
exit
|
33
|
+
end
|
34
|
+
opts.separator ''
|
35
|
+
end.parse!
|
36
|
+
abort '-i is mandatory' if o[:in].nil?
|
37
|
+
abort '-o is mandatory' if o[:out].nil?
|
38
|
+
abort '-q must be in the range -5 .. 41' if o[:qual] < -5 || o[:qual] > 41
|
39
|
+
|
40
|
+
# Determine quality character
|
41
|
+
$qchar = (o[:qual] + o[:encoding]).chr
|
42
|
+
|
43
|
+
# Create file handlers
|
44
|
+
ifh = o[:in] =~ /\.gz$/ ?
|
45
|
+
Zlib::GzipReader.open(o[:in]) : File.open(o[:in], 'r')
|
46
|
+
ofh = o[:out] =~ /\.gz$/ ?
|
47
|
+
Zlib::GzipWriter.open(o[:out]) : File.open(o[:out], 'w')
|
48
|
+
|
49
|
+
# Writes one FastQ record to +ofh+: the defline "@<id>", the sequence, a "+"
# separator, and a quality line made of the global $qchar repeated once per
# base. Writes nothing when +seq+ is empty.
def print_seq(ofh, id, seq)
|
50
|
+
ofh.puts "@#{id}", seq, '+', $qchar * seq.length unless seq.empty?
|
51
|
+
end
|
52
|
+
|
53
|
+
# Generate FastQ
|
54
|
+
id = ''
|
55
|
+
seq = ''
|
56
|
+
ifh.each_line do |ln|
|
57
|
+
next if ln =~ /^;/
|
58
|
+
if ln =~ /^>(.*)/
|
59
|
+
print_seq(ofh, id, seq)
|
60
|
+
seq = ''
|
61
|
+
id = $1
|
62
|
+
else
|
63
|
+
seq += ln.chomp.upcase.gsub(/[^A-Z]/,'')
|
64
|
+
end
|
65
|
+
end
|
66
|
+
print_seq(ofh, id, seq)
|
67
|
+
ofh.close
|
68
|
+
ifh.close
|
69
|
+
|
@@ -0,0 +1,89 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$VERSION = 1.2
|
4
|
+
$:.push File.expand_path('../lib', __FILE__)
|
5
|
+
require 'enveomics_rb/enveomics'
|
6
|
+
|
7
|
+
o = { q: false, offset: 33, qual: 15, fasta: false }
|
8
|
+
OptionParser.new do |opts|
|
9
|
+
opts.version = $VERSION
|
10
|
+
Enveomics.opt_banner(
|
11
|
+
opts, 'Masks low-quality bases in a FastQ file',
|
12
|
+
"#{File.basename($0)} -i in.fastq -o out.fastq [options]"
|
13
|
+
)
|
14
|
+
|
15
|
+
opts.separator 'Mandatory'
|
16
|
+
opts.on(
|
17
|
+
'-i', '--input FILE',
|
18
|
+
'Path to the FastQ file containing the sequences',
|
19
|
+
'Supports compression with .gz extension, use - for STDIN'
|
20
|
+
) { |v| o[:in] = v }
|
21
|
+
opts.on(
|
22
|
+
'-o', '--out FILE',
|
23
|
+
'Path to the output FastQ file',
|
24
|
+
'Supports compression with .gz extension, use - for STDOUT'
|
25
|
+
) { |v| o[:out] = v }
|
26
|
+
|
27
|
+
opts.separator ''
|
28
|
+
opts.separator 'Quality Options'
|
29
|
+
opts.on(
|
30
|
+
'-q', '--qual INT', Integer,
|
31
|
+
"Minimum quality score to allow a base, by default: #{o[:qual]}"
|
32
|
+
) { |v| o[:qual] = v }
|
33
|
+
opts.on(
|
34
|
+
'--offset INT', Integer,
|
35
|
+
"Q-score offset, by default: #{o[:offset]}"
|
36
|
+
) { |v| o[:offset] = v }
|
37
|
+
|
38
|
+
opts.separator ''
|
39
|
+
opts.separator 'Other Options'
|
40
|
+
opts.on(
|
41
|
+
'-a', '--fasta', 'Output sequences in FastA format'
|
42
|
+
) { |v| o[:fasta] = v }
|
43
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
|
44
|
+
opts.on('-h', '--help', 'Display this screen') do
|
45
|
+
puts opts
|
46
|
+
exit
|
47
|
+
end
|
48
|
+
opts.separator ''
|
49
|
+
end.parse!
|
50
|
+
|
51
|
+
raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
|
52
|
+
raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?
|
53
|
+
$QUIET = o[:q]
|
54
|
+
|
55
|
+
# Open in/out files
|
56
|
+
say 'Reading FastQ file'
|
57
|
+
ifh = reader(o[:in])
|
58
|
+
ofh = writer(o[:out])
|
59
|
+
|
60
|
+
# Parse and mask
|
61
|
+
entry = []
|
62
|
+
lno = 0
|
63
|
+
ifh.each_line do |ln|
|
64
|
+
lno += 1 # <- Gzip doesn't support $.
|
65
|
+
case lno % 4
|
66
|
+
when 1
|
67
|
+
ln =~ /^@(\S+)/ or
|
68
|
+
raise Enveomics::ParseError.new("Unexpected defline format: #{ln}")
|
69
|
+
entry << ln
|
70
|
+
when 2, 3
|
71
|
+
entry << ln
|
72
|
+
when 0
|
73
|
+
entry << ln
|
74
|
+
q = entry[3].chomp.split('').map { |i| (i.ord - o[:offset]) }
|
75
|
+
q.map { |i| i < o[:qual] }.each_with_index { |i, k| entry[1][k] = 'N' if i }
|
76
|
+
ofh.puts(o[:fasta] ? [entry[0].gsub(/^@/, '>'), entry[1]] : entry)
|
77
|
+
entry = []
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
# Finalize
|
82
|
+
say " Lines: #{lno}"
|
83
|
+
unless entry.empty?
|
84
|
+
raise Enveomics::ParseError.new('Unexpected trailing lines in FastQ')
|
85
|
+
end
|
86
|
+
say " Sequences: #{lno / 4}"
|
87
|
+
ifh.close
|
88
|
+
ofh.close
|
89
|
+
|