miga-base 0.7.26.3 → 1.0.0.sr1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
- data/lib/miga/cli/action/doctor.rb +50 -19
- data/lib/miga/cli/action/doctor/base.rb +20 -18
- data/lib/miga/cli/action/init.rb +11 -7
- data/lib/miga/cli/action/init/files_helper.rb +1 -0
- data/lib/miga/cli/action/ncbi_get.rb +3 -3
- data/lib/miga/cli/action/tax_dist.rb +2 -2
- data/lib/miga/cli/action/wf.rb +5 -4
- data/lib/miga/daemon.rb +11 -4
- data/lib/miga/dataset/result.rb +10 -6
- data/lib/miga/json.rb +1 -2
- data/lib/miga/metadata.rb +5 -1
- data/lib/miga/parallel.rb +11 -6
- data/lib/miga/project.rb +8 -8
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -2
- data/lib/miga/sqlite.rb +7 -0
- data/lib/miga/version.rb +23 -9
- data/scripts/aai_distances.bash +16 -18
- data/scripts/ani_distances.bash +16 -17
- data/scripts/assembly.bash +31 -16
- data/scripts/haai_distances.bash +3 -27
- data/scripts/miga.bash +6 -4
- data/scripts/p.bash +1 -1
- data/scripts/read_quality.bash +9 -18
- data/scripts/trimmed_fasta.bash +14 -30
- data/scripts/trimmed_reads.bash +36 -36
- data/test/parallel_test.rb +31 -0
- data/test/project_test.rb +2 -1
- data/utils/distance/commands.rb +1 -0
- data/utils/distance/runner.rb +2 -4
- data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
- data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
- data/utils/enveomics/Manifest/Tasks/other.json +77 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
- data/utils/enveomics/Manifest/categories.json +13 -4
- data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
- data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
- data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
- data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
- data/utils/enveomics/Scripts/SRA.download.bash +6 -8
- data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
- data/utils/enveomics/Scripts/aai.rb +3 -2
- data/utils/enveomics/Scripts/anir.rb +137 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
- data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
- data/utils/enveomics/Scripts/rbm.rb +87 -133
- data/utils/enveomics/Scripts/sam.filter.rb +148 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
- data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
- data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
- data/utils/enveomics/enveomics.R/R/utils.R +30 -0
- data/utils/enveomics/enveomics.R/README.md +1 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +67 -0
- data/utils/multitrim/multitrim.py +1555 -0
- data/utils/multitrim/multitrim.yml +13 -0
- data/utils/requirements.txt +4 -3
- metadata +33 -6
- data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
|
@@ -1,92 +1,100 @@
|
|
|
1
1
|
#!/usr/bin/env ruby
|
|
2
2
|
|
|
3
|
-
#
|
|
4
|
-
# @license artistic license 2.0
|
|
3
|
+
# frozen_string_literal: true
|
|
5
4
|
|
|
6
|
-
$:.push File.expand_path(
|
|
7
|
-
require
|
|
8
|
-
require
|
|
5
|
+
$:.push File.expand_path('../lib', __FILE__)
|
|
6
|
+
require 'enveomics_rb/enveomics'
|
|
7
|
+
require 'enveomics_rb/stats'
|
|
8
|
+
$VERSION = 1.0
|
|
9
9
|
|
|
10
|
-
o = {q:false, completeness:nil, minlen:500, shuffle:true}
|
|
10
|
+
o = { q: false, completeness: nil, minlen: 500, shuffle: true }
|
|
11
11
|
OptionParser.new do |opts|
|
|
12
|
-
opts.
|
|
13
|
-
|
|
12
|
+
opts.version = $VERSION
|
|
13
|
+
Enveomics.opt_banner(
|
|
14
|
+
opts, 'Simulates incomplete (fragmented) drafts from complete genomes',
|
|
15
|
+
"#{File.basename($0)} -i in.fasta -o out.fasta -c 0.5 [options]"
|
|
16
|
+
)
|
|
14
17
|
|
|
15
|
-
|
|
16
|
-
opts.
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
opts.on(
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
opts.
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
opts.
|
|
31
|
-
|
|
32
|
-
opts.on(
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
18
|
+
opts.separator 'Mandatory'
|
|
19
|
+
opts.on(
|
|
20
|
+
'-i', '--in FILE',
|
|
21
|
+
'Path to the FastA file containing the complete sequences',
|
|
22
|
+
'Supports compression with .gz extension, use - for STDIN'
|
|
23
|
+
) { |v| o[:in] = v }
|
|
24
|
+
opts.on(
|
|
25
|
+
'-o', '--out FILE', 'Path to the FastA to create',
|
|
26
|
+
'Supports compression with .gz extension, use - for STDOUT'
|
|
27
|
+
) { |v| o[:out] = v }
|
|
28
|
+
opts.on(
|
|
29
|
+
'-c', '--completeness FLOAT',
|
|
30
|
+
'Fraction of genome completeness to simulate from 0 to 1'
|
|
31
|
+
) { |v| o[:completeness] = v.to_f }
|
|
32
|
+
|
|
33
|
+
opts.separator ''
|
|
34
|
+
opts.separator 'Options'
|
|
35
|
+
opts.on(
|
|
36
|
+
'-m', '--minlen INT',
|
|
37
|
+
"Minimum fragment length to report. By default: #{o[:minlen]}"
|
|
38
|
+
) { |v| o[:minlen] = v.to_i }
|
|
39
|
+
opts.on(
|
|
40
|
+
'-s', '--sorted', 'Keep fragments sorted as in the input file',
|
|
41
|
+
'By default, fragments are shuffled'
|
|
42
|
+
) { |v| o[:shuffle] = !v }
|
|
43
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
|
|
44
|
+
opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
|
|
45
|
+
opts.separator ''
|
|
38
46
|
end.parse!
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
47
|
+
|
|
48
|
+
raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
|
|
49
|
+
raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?
|
|
50
|
+
raise Enveomics::OptionError.new('-c is mandatory') if o[:completeness].nil?
|
|
42
51
|
|
|
43
52
|
begin
|
|
44
53
|
# Read input sequences
|
|
45
54
|
g_id = []
|
|
46
55
|
g_seq = []
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
end
|
|
56
|
+
ifh = reader(o[:in])
|
|
57
|
+
id = ''
|
|
58
|
+
ifh.each_line do |ln|
|
|
59
|
+
if ln =~ /^>(\S*)/
|
|
60
|
+
g_id << $1
|
|
61
|
+
g_seq << ''
|
|
62
|
+
else
|
|
63
|
+
g_seq[g_seq.size - 1] += ln.gsub(/[^A-Za-z]/, '')
|
|
56
64
|
end
|
|
57
65
|
end
|
|
58
|
-
|
|
66
|
+
ifh.close
|
|
67
|
+
|
|
59
68
|
# Fragment genomes
|
|
60
69
|
f = {}
|
|
61
70
|
binlen = [1, (o[:minlen].to_f/(1.5**2)).ceil].max
|
|
62
71
|
p = [0.001, [1.0, 1.0 - (o[:completeness]/1.25 + 0.1)].min].max
|
|
63
|
-
while
|
|
72
|
+
while !g_seq.empty?
|
|
64
73
|
id = g_id.shift
|
|
65
74
|
seq = g_seq.shift
|
|
66
75
|
gL = seq.length
|
|
67
|
-
while
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
seq =
|
|
76
|
+
while !seq.empty?
|
|
77
|
+
rand_x =
|
|
78
|
+
Enveomics::Stats.r_geom(p).to_f + Enveomics::Stats.r_unif(-0.5, 0.5)
|
|
79
|
+
fL = [0, (rand_x * binlen).round].max
|
|
80
|
+
f["#{f.size+1}_#{id}"] = seq[0, fL] if fL >= o[:minlen]
|
|
81
|
+
seq = seq[(fL + 1) .. -1]
|
|
82
|
+
seq = '' if seq.nil?
|
|
73
83
|
end
|
|
74
84
|
end
|
|
75
85
|
|
|
76
86
|
# Save output
|
|
77
87
|
k = f.keys
|
|
78
88
|
k.shuffle! if o[:shuffle]
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
end
|
|
89
|
+
ofh = writer(o[:out])
|
|
90
|
+
k.each do |id|
|
|
91
|
+
ofh.puts ">#{id}"
|
|
92
|
+
ofh.puts f[id].gsub(/(\S{50})/, "\\1\n")
|
|
84
93
|
end
|
|
85
|
-
|
|
94
|
+
ofh.close
|
|
86
95
|
rescue => err
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
96
|
+
$stderr.puts "Exception: #{err}\n\n"
|
|
97
|
+
err.backtrace.each { |l| $stderr.puts l + "\n" }
|
|
98
|
+
err
|
|
90
99
|
end
|
|
91
100
|
|
|
92
|
-
|
|
@@ -1,43 +1,57 @@
|
|
|
1
1
|
#!/usr/bin/env ruby
|
|
2
2
|
|
|
3
|
-
#
|
|
4
|
-
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
|
5
|
-
# @license Artistic-2.0
|
|
6
|
-
#
|
|
3
|
+
# frozen_string_literal: false
|
|
7
4
|
|
|
8
|
-
|
|
5
|
+
$VERSION = 1.0
|
|
6
|
+
$:.push File.expand_path('../lib', __FILE__)
|
|
7
|
+
require 'enveomics_rb/enveomics'
|
|
9
8
|
|
|
10
|
-
o = {q: false, rep: false}
|
|
11
|
-
ARGV << '-h' if ARGV.size==0
|
|
9
|
+
o = { q: false, rep: false }
|
|
12
10
|
|
|
13
11
|
OptionParser.new do |opt|
|
|
14
|
-
|
|
15
|
-
Samples a random set of sequences from a multi-FastA file
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
opt.separator ''
|
|
12
|
+
Enveomics.opt_banner(
|
|
13
|
+
opt, 'Samples a random set of sequences from a multi-FastA file',
|
|
14
|
+
"#{File.basename($0)} -i seq.fa -o 10pc.fa -f 0.1 [options]"
|
|
15
|
+
)
|
|
19
16
|
opt.separator 'Mandatory'
|
|
20
|
-
opt.on(
|
|
21
|
-
|
|
22
|
-
|
|
17
|
+
opt.on(
|
|
18
|
+
'-i', '--in PATH',
|
|
19
|
+
'Input FastA file',
|
|
20
|
+
'Supports compression with .gz extension, use - for STDIN'
|
|
21
|
+
) { |v| o[:i] = v }
|
|
22
|
+
opt.on(
|
|
23
|
+
'-o', '--out PATH',
|
|
24
|
+
'Output FastA file',
|
|
25
|
+
'Supports compression with .gz extension, use - for STDOUT'
|
|
26
|
+
) { |v| o[:o] = v }
|
|
27
|
+
opt.on(
|
|
28
|
+
'-f', '--fraction FLOAT', Float,
|
|
23
29
|
'Fraction of sequences to sample [0-1].',
|
|
24
|
-
'Mandatory unless -
|
|
30
|
+
'Mandatory unless -c is provided.'
|
|
31
|
+
) { |v| o[:f] = v }
|
|
25
32
|
opt.separator ''
|
|
33
|
+
|
|
26
34
|
opt.separator 'Options'
|
|
27
|
-
opt.on(
|
|
28
|
-
'
|
|
29
|
-
'
|
|
30
|
-
|
|
31
|
-
|
|
35
|
+
opt.on(
|
|
36
|
+
'-c', '--number INT', Integer,
|
|
37
|
+
'Number of sequences to sample',
|
|
38
|
+
'Mandatory unless -f is provided'
|
|
39
|
+
) { |v| o[:n] = v }
|
|
40
|
+
opt.on('-r', '--replacement','Sample with replacement') { |v| o[:rep] = v }
|
|
41
|
+
opt.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
|
|
32
42
|
opt.on('-h', '--help', 'Display this screen.') do
|
|
33
43
|
puts opt
|
|
34
44
|
exit
|
|
35
45
|
end
|
|
36
46
|
opt.separator ''
|
|
37
47
|
end.parse!
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
48
|
+
|
|
49
|
+
raise Enveomics::OptionError.new('-i is mandatory') if o[:i].nil?
|
|
50
|
+
raise Enveomics::OptionError.new('-o is mandatory') if o[:o].nil?
|
|
51
|
+
if o[:f].nil? && o[:n].nil?
|
|
52
|
+
raise Enveomics::OptionError.new('-f or -n is mandatory')
|
|
53
|
+
end
|
|
54
|
+
$QUIET = o[:q]
|
|
41
55
|
|
|
42
56
|
# Functions to parse sequences
|
|
43
57
|
def do_stuff(id, sq)
|
|
@@ -53,31 +67,32 @@ def do_stuff(id, sq)
|
|
|
53
67
|
end
|
|
54
68
|
|
|
55
69
|
# Parse sequences
|
|
56
|
-
|
|
70
|
+
say 'Parsing sequences'
|
|
57
71
|
seq = []
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
end
|
|
72
|
+
fh = reader(o[:i])
|
|
73
|
+
id = nil
|
|
74
|
+
sq = ''
|
|
75
|
+
fh.each do |ln|
|
|
76
|
+
next if ln =~ /^;/
|
|
77
|
+
if ln =~ /^>(.+)/
|
|
78
|
+
seq << [id, sq] unless id.nil?
|
|
79
|
+
id = $1
|
|
80
|
+
sq = ''
|
|
81
|
+
else
|
|
82
|
+
sq << ln
|
|
70
83
|
end
|
|
71
|
-
seq << [id, sq] unless id.nil?
|
|
72
84
|
end
|
|
73
|
-
|
|
85
|
+
seq << [id, sq] unless id.nil?
|
|
86
|
+
fh.close
|
|
87
|
+
say "Input sequences: #{seq.size}"
|
|
88
|
+
|
|
74
89
|
o[:n] ||= (seq.size * o[:f]).round
|
|
75
|
-
seq_o = o[:rep] ? o[:n].times.map{ seq.sample } : seq.sample(o[:n])
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
end
|
|
90
|
+
seq_o = o[:rep] ? o[:n].times.map { seq.sample } : seq.sample(o[:n])
|
|
91
|
+
fh = writer(o[:o])
|
|
92
|
+
seq_o.each do |i|
|
|
93
|
+
fh.puts ">#{i[0]}"
|
|
94
|
+
fh.puts i[1]
|
|
81
95
|
end
|
|
82
|
-
|
|
96
|
+
fh.close
|
|
97
|
+
say "Output sequences: #{seq_o.size}"
|
|
83
98
|
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
require 'optparse'
|
|
4
|
+
require 'zlib'
|
|
5
|
+
|
|
6
|
+
o = { qual: 31, encoding: 33 }
|
|
7
|
+
ARGV << '-h' if ARGV.empty?
|
|
8
|
+
OptionParser.new do |opts|
|
|
9
|
+
opts.banner = "
|
|
10
|
+
Creates a FastQ-compliant file from a FastA file.
|
|
11
|
+
|
|
12
|
+
Usage: #{$0} [options]"
|
|
13
|
+
opts.separator ''
|
|
14
|
+
opts.separator 'Options'
|
|
15
|
+
opts.on(
|
|
16
|
+
'-i', '--in FILE', 'Input FastA file (supports .gz compression)'
|
|
17
|
+
) { |v| o[:in] = v }
|
|
18
|
+
opts.on(
|
|
19
|
+
'-o', '--out FILE', 'Output FastQ file (supports .gz compression)'
|
|
20
|
+
) { |v| o[:out] = v }
|
|
21
|
+
opts.on(
|
|
22
|
+
'-q', '--quality INT', Integer,
|
|
23
|
+
'PHRED quality score to use (fixed), in the range [-5, 41]',
|
|
24
|
+
"By default: #{o[:qual]}"
|
|
25
|
+
) { |v| o[:qual] = v }
|
|
26
|
+
opts.on(
|
|
27
|
+
'--encoding INT', Integer,
|
|
28
|
+
"Base encoding (33 or 64). By default: #{o[:encoding]}"
|
|
29
|
+
) { |v| o[:encoding] = v }
|
|
30
|
+
opts.on('-h', '--help', 'Display this screen.') do
|
|
31
|
+
puts opts
|
|
32
|
+
exit
|
|
33
|
+
end
|
|
34
|
+
opts.separator ''
|
|
35
|
+
end.parse!
|
|
36
|
+
abort '-i is mandatory' if o[:in].nil?
|
|
37
|
+
abort '-o is mandatory' if o[:out].nil?
|
|
38
|
+
abort '-q must be in the range -5 .. 41' if o[:qual] < -5 || o[:qual] > 41
|
|
39
|
+
|
|
40
|
+
# Determine quality character
|
|
41
|
+
$qchar = (o[:qual] + o[:encoding]).chr
|
|
42
|
+
|
|
43
|
+
# Create file handlers
|
|
44
|
+
ifh = o[:in] =~ /\.gz$/ ?
|
|
45
|
+
Zlib::GzipReader.open(o[:in]) : File.open(o[:in], 'r')
|
|
46
|
+
ofh = o[:out] =~ /\.gz$/ ?
|
|
47
|
+
Zlib::GzipWriter.open(o[:out]) : File.open(o[:out], 'w')
|
|
48
|
+
|
|
49
|
+
def print_seq(ofh, id, seq)
|
|
50
|
+
ofh.puts "@#{id}", seq, '+', $qchar * seq.length unless seq.empty?
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
# Generate FastQ
|
|
54
|
+
id = ''
|
|
55
|
+
seq = ''
|
|
56
|
+
ifh.each_line do |ln|
|
|
57
|
+
next if ln =~ /^;/
|
|
58
|
+
if ln =~ /^>(.*)/
|
|
59
|
+
print_seq(ofh, id, seq)
|
|
60
|
+
seq = ''
|
|
61
|
+
id = $1
|
|
62
|
+
else
|
|
63
|
+
seq += ln.chomp.upcase.gsub(/[^A-Z]/,'')
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
print_seq(ofh, id, seq)
|
|
67
|
+
ofh.close
|
|
68
|
+
ifh.close
|
|
69
|
+
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
$VERSION = 1.2
|
|
4
|
+
$:.push File.expand_path('../lib', __FILE__)
|
|
5
|
+
require 'enveomics_rb/enveomics'
|
|
6
|
+
|
|
7
|
+
o = { q: false, offset: 33, qual: 15, fasta: false }
|
|
8
|
+
OptionParser.new do |opts|
|
|
9
|
+
opts.version = $VERSION
|
|
10
|
+
Enveomics.opt_banner(
|
|
11
|
+
opts, 'Masks low-quality bases in a FastQ file',
|
|
12
|
+
"#{File.basename($0)} -i in.fastq -o out.fastq [options]"
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
opts.separator 'Mandatory'
|
|
16
|
+
opts.on(
|
|
17
|
+
'-i', '--input FILE',
|
|
18
|
+
'Path to the FastQ file containing the sequences',
|
|
19
|
+
'Supports compression with .gz extension, use - for STDIN'
|
|
20
|
+
) { |v| o[:in] = v }
|
|
21
|
+
opts.on(
|
|
22
|
+
'-o', '--out FILE',
|
|
23
|
+
'Path to the output FastQ file',
|
|
24
|
+
'Supports compression with .gz extension, use - for STDOUT'
|
|
25
|
+
) { |v| o[:out] = v }
|
|
26
|
+
|
|
27
|
+
opts.separator ''
|
|
28
|
+
opts.separator 'Quality Options'
|
|
29
|
+
opts.on(
|
|
30
|
+
'-q', '--qual INT', Integer,
|
|
31
|
+
"Minimum quality score to allow a base, by default: #{o[:qual]}"
|
|
32
|
+
) { |v| o[:qual] = v }
|
|
33
|
+
opts.on(
|
|
34
|
+
'--offset INT', Integer,
|
|
35
|
+
"Q-score offset, by default: #{o[:offset]}"
|
|
36
|
+
) { |v| o[:offset] = v }
|
|
37
|
+
|
|
38
|
+
opts.separator ''
|
|
39
|
+
opts.separator 'Other Options'
|
|
40
|
+
opts.on(
|
|
41
|
+
'-a', '--fasta', 'Output sequences in FastA format'
|
|
42
|
+
) { |v| o[:fasta] = v }
|
|
43
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
|
|
44
|
+
opts.on('-h', '--help', 'Display this screen') do
|
|
45
|
+
puts opts
|
|
46
|
+
exit
|
|
47
|
+
end
|
|
48
|
+
opts.separator ''
|
|
49
|
+
end.parse!
|
|
50
|
+
|
|
51
|
+
raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
|
|
52
|
+
raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?
|
|
53
|
+
$QUIET = o[:q]
|
|
54
|
+
|
|
55
|
+
# Open in/out files
|
|
56
|
+
say 'Reading FastQ file'
|
|
57
|
+
ifh = reader(o[:in])
|
|
58
|
+
ofh = writer(o[:out])
|
|
59
|
+
|
|
60
|
+
# Parse and mask
|
|
61
|
+
entry = []
|
|
62
|
+
lno = 0
|
|
63
|
+
ifh.each_line do |ln|
|
|
64
|
+
lno += 1 # <- Gzip doesn't support $.
|
|
65
|
+
case lno % 4
|
|
66
|
+
when 1
|
|
67
|
+
ln =~ /^@(\S+)/ or
|
|
68
|
+
raise Enveomics::ParseError.new("Unexpected defline format: #{ln}")
|
|
69
|
+
entry << ln
|
|
70
|
+
when 2, 3
|
|
71
|
+
entry << ln
|
|
72
|
+
when 0
|
|
73
|
+
entry << ln
|
|
74
|
+
q = entry[3].chomp.split('').map { |i| (i.ord - o[:offset]) }
|
|
75
|
+
q.map { |i| i < o[:qual] }.each_with_index { |i, k| entry[1][k] = 'N' if i }
|
|
76
|
+
ofh.puts(o[:fasta] ? [entry[0].gsub(/^@/, '>'), entry[1]] : entry)
|
|
77
|
+
entry = []
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
# Finalize
|
|
82
|
+
say " Lines: #{lno}"
|
|
83
|
+
unless entry.empty?
|
|
84
|
+
raise Enveomics::ParseError.new('Unexpected trailing lines in FastQ')
|
|
85
|
+
end
|
|
86
|
+
say " Sequences: #{lno / 4}"
|
|
87
|
+
ifh.close
|
|
88
|
+
ofh.close
|
|
89
|
+
|