miga-base 0.7.26.2 → 1.0.3.0
This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
- data/lib/miga/cli/action/classify_wf.rb +2 -2
- data/lib/miga/cli/action/derep_wf.rb +1 -1
- data/lib/miga/cli/action/doctor.rb +57 -14
- data/lib/miga/cli/action/doctor/base.rb +47 -23
- data/lib/miga/cli/action/env.rb +26 -0
- data/lib/miga/cli/action/init.rb +11 -7
- data/lib/miga/cli/action/init/files_helper.rb +1 -0
- data/lib/miga/cli/action/ncbi_get.rb +3 -3
- data/lib/miga/cli/action/tax_dist.rb +2 -2
- data/lib/miga/cli/action/wf.rb +5 -4
- data/lib/miga/cli/base.rb +1 -0
- data/lib/miga/common.rb +1 -0
- data/lib/miga/daemon.rb +11 -4
- data/lib/miga/dataset/result.rb +10 -6
- data/lib/miga/json.rb +5 -4
- data/lib/miga/metadata.rb +5 -1
- data/lib/miga/parallel.rb +36 -0
- data/lib/miga/project.rb +8 -8
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -2
- data/lib/miga/sqlite.rb +10 -2
- data/lib/miga/version.rb +23 -9
- data/scripts/aai_distances.bash +16 -18
- data/scripts/ani_distances.bash +16 -17
- data/scripts/assembly.bash +31 -16
- data/scripts/haai_distances.bash +3 -27
- data/scripts/miga.bash +12 -8
- data/scripts/p.bash +1 -1
- data/scripts/read_quality.bash +9 -18
- data/scripts/trimmed_fasta.bash +14 -30
- data/scripts/trimmed_reads.bash +36 -36
- data/test/parallel_test.rb +31 -0
- data/test/project_test.rb +2 -1
- data/test/remote_dataset_test.rb +1 -1
- data/utils/distance/commands.rb +1 -0
- data/utils/distance/database.rb +0 -1
- data/utils/distance/runner.rb +2 -4
- data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
- data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
- data/utils/enveomics/Manifest/Tasks/other.json +77 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
- data/utils/enveomics/Manifest/categories.json +13 -4
- data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
- data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
- data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
- data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
- data/utils/enveomics/Scripts/SRA.download.bash +6 -8
- data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
- data/utils/enveomics/Scripts/aai.rb +3 -2
- data/utils/enveomics/Scripts/anir.rb +137 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
- data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
- data/utils/enveomics/Scripts/rbm.rb +87 -133
- data/utils/enveomics/Scripts/sam.filter.rb +148 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
- data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
- data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
- data/utils/enveomics/enveomics.R/R/utils.R +30 -0
- data/utils/enveomics/enveomics.R/README.md +1 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +67 -0
- data/utils/multitrim/multitrim.py +1555 -0
- data/utils/multitrim/multitrim.yml +13 -0
- data/utils/requirements.txt +4 -3
- data/utils/subclade/pipeline.rb +2 -2
- metadata +33 -4
- data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
@@ -1,9 +1,8 @@
|
|
1
1
|
#!/usr/bin/env perl
|
2
|
-
|
2
|
+
|
3
3
|
# @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
4
|
-
# @
|
5
|
-
|
6
|
-
#
|
4
|
+
# @license: Artistic-2.0
|
5
|
+
|
7
6
|
use strict;
|
8
7
|
use warnings;
|
9
8
|
use List::Util qw/sum min max/;
|
@@ -11,46 +10,51 @@ use List::Util qw/sum min max/;
|
|
11
10
|
my ($seqs, $minlen, $n__) = @ARGV;
|
12
11
|
$seqs or die "
|
13
12
|
Description:
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
13
|
+
Calculates the N50 value of a set of sequences. Alternatively, it
|
14
|
+
can calculate other N** values. It also calculates the total number
|
15
|
+
of sequences, the total added length, and the longest sequence length.
|
16
|
+
|
18
17
|
Usage:
|
19
|
-
|
18
|
+
$0 seqs.fa [minlen [**]]
|
19
|
+
|
20
|
+
seqs.fa A FastA file containing the sequences
|
21
|
+
minlen (optional) The minimum length to take into consideration
|
22
|
+
By default: 0
|
23
|
+
** (optional) Value N** to calculate. By default: 50 (N50)
|
20
24
|
|
21
|
-
seqs.fa A FastA file containing the sequences.
|
22
|
-
minlen (optional) The minimum length to take into consideration.
|
23
|
-
By default: 0.
|
24
|
-
** Value N** to calculate. By default: 50 (N50).
|
25
25
|
";
|
26
|
+
|
26
27
|
$minlen ||= 0;
|
27
28
|
$n__ ||= 50;
|
28
29
|
|
29
30
|
my @len = ();
|
30
31
|
open SEQ, "<", $seqs or die "Cannot open file: $seqs: $!\n";
|
31
32
|
while(<SEQ>){
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
33
|
+
if(/^>/){
|
34
|
+
push @len, 0;
|
35
|
+
}else{
|
36
|
+
next if /^;/;
|
37
|
+
chomp;
|
38
|
+
s/\W//g;
|
39
|
+
$len[-1] += length $_;
|
40
|
+
}
|
40
41
|
}
|
41
42
|
close SEQ;
|
42
|
-
|
43
|
+
|
44
|
+
@len = sort { $a <=> $b } map { $_ >= $minlen ? $_ : () } @len;
|
43
45
|
my $tot = (sum(@len) || 0);
|
44
46
|
|
45
|
-
my $thr = $n__
|
47
|
+
my $thr = $n__ * $tot / 100;
|
46
48
|
my $pos = 0;
|
47
49
|
for(@len){
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
50
|
+
$pos += $_;
|
51
|
+
if($pos >= $thr){
|
52
|
+
print "N$n__: $_\n";
|
53
|
+
last;
|
54
|
+
}
|
53
55
|
}
|
54
|
-
|
56
|
+
|
57
|
+
print "Sequences: " . scalar(@len) . "\n";
|
55
58
|
print "Total length: $tot\n";
|
59
|
+
print "Longest sequence: " . pop(@len) . "\n";
|
56
60
|
|
@@ -1,92 +1,100 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
#
|
4
|
-
# @license artistic license 2.0
|
3
|
+
# frozen_string_literal: true
|
5
4
|
|
6
|
-
$:.push File.expand_path(
|
7
|
-
require
|
8
|
-
require
|
5
|
+
$:.push File.expand_path('../lib', __FILE__)
|
6
|
+
require 'enveomics_rb/enveomics'
|
7
|
+
require 'enveomics_rb/stats'
|
8
|
+
$VERSION = 1.0
|
9
9
|
|
10
|
-
o = {q:false, completeness:nil, minlen:500, shuffle:true}
|
10
|
+
o = { q: false, completeness: nil, minlen: 500, shuffle: true }
|
11
11
|
OptionParser.new do |opts|
|
12
|
-
opts.
|
13
|
-
|
12
|
+
opts.version = $VERSION
|
13
|
+
Enveomics.opt_banner(
|
14
|
+
opts, 'Simulates incomplete (fragmented) drafts from complete genomes',
|
15
|
+
"#{File.basename($0)} -i in.fasta -o out.fasta -c 0.5 [options]"
|
16
|
+
)
|
14
17
|
|
15
|
-
|
16
|
-
opts.
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
opts.on(
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
opts.
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
opts.
|
31
|
-
|
32
|
-
opts.on(
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
18
|
+
opts.separator 'Mandatory'
|
19
|
+
opts.on(
|
20
|
+
'-i', '--in FILE',
|
21
|
+
'Path to the FastA file containing the complete sequences',
|
22
|
+
'Supports compression with .gz extension, use - for STDIN'
|
23
|
+
) { |v| o[:in] = v }
|
24
|
+
opts.on(
|
25
|
+
'-o', '--out FILE', 'Path to the FastA to create',
|
26
|
+
'Supports compression with .gz extension, use - for STDOUT'
|
27
|
+
) { |v| o[:out] = v }
|
28
|
+
opts.on(
|
29
|
+
'-c', '--completeness FLOAT',
|
30
|
+
'Fraction of genome completeness to simulate from 0 to 1'
|
31
|
+
) { |v| o[:completeness] = v.to_f }
|
32
|
+
|
33
|
+
opts.separator ''
|
34
|
+
opts.separator 'Options'
|
35
|
+
opts.on(
|
36
|
+
'-m', '--minlen INT',
|
37
|
+
"Minimum fragment length to report. By default: #{o[:minlen]}"
|
38
|
+
) { |v| o[:minlen] = v.to_i }
|
39
|
+
opts.on(
|
40
|
+
'-s', '--sorted', 'Keep fragments sorted as in the input file',
|
41
|
+
'By default, fragments are shuffled'
|
42
|
+
) { |v| o[:shuffle] = !v }
|
43
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
|
44
|
+
opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
|
45
|
+
opts.separator ''
|
38
46
|
end.parse!
|
39
|
-
|
40
|
-
|
41
|
-
|
47
|
+
|
48
|
+
raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
|
49
|
+
raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?
|
50
|
+
raise Enveomics::OptionError.new('-c is mandatory') if o[:completeness].nil?
|
42
51
|
|
43
52
|
begin
|
44
53
|
# Read input sequences
|
45
54
|
g_id = []
|
46
55
|
g_seq = []
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
end
|
56
|
+
ifh = reader(o[:in])
|
57
|
+
id = ''
|
58
|
+
ifh.each_line do |ln|
|
59
|
+
if ln =~ /^>(\S*)/
|
60
|
+
g_id << $1
|
61
|
+
g_seq << ''
|
62
|
+
else
|
63
|
+
g_seq[g_seq.size - 1] += ln.gsub(/[^A-Za-z]/, '')
|
56
64
|
end
|
57
65
|
end
|
58
|
-
|
66
|
+
ifh.close
|
67
|
+
|
59
68
|
# Fragment genomes
|
60
69
|
f = {}
|
61
70
|
binlen = [1, (o[:minlen].to_f/(1.5**2)).ceil].max
|
62
71
|
p = [0.001, [1.0, 1.0 - (o[:completeness]/1.25 + 0.1)].min].max
|
63
|
-
while
|
72
|
+
while !g_seq.empty?
|
64
73
|
id = g_id.shift
|
65
74
|
seq = g_seq.shift
|
66
75
|
gL = seq.length
|
67
|
-
while
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
seq =
|
76
|
+
while !seq.empty?
|
77
|
+
rand_x =
|
78
|
+
Enveomics::Stats.r_geom(p).to_f + Enveomics::Stats.r_unif(-0.5, 0.5)
|
79
|
+
fL = [0, (rand_x * binlen).round].max
|
80
|
+
f["#{f.size+1}_#{id}"] = seq[0, fL] if fL >= o[:minlen]
|
81
|
+
seq = seq[(fL + 1) .. -1]
|
82
|
+
seq = '' if seq.nil?
|
73
83
|
end
|
74
84
|
end
|
75
85
|
|
76
86
|
# Save output
|
77
87
|
k = f.keys
|
78
88
|
k.shuffle! if o[:shuffle]
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
end
|
89
|
+
ofh = writer(o[:out])
|
90
|
+
k.each do |id|
|
91
|
+
ofh.puts ">#{id}"
|
92
|
+
ofh.puts f[id].gsub(/(\S{50})/, "\\1\n")
|
84
93
|
end
|
85
|
-
|
94
|
+
ofh.close
|
86
95
|
rescue => err
|
87
|
-
|
88
|
-
|
89
|
-
|
96
|
+
$stderr.puts "Exception: #{err}\n\n"
|
97
|
+
err.backtrace.each { |l| $stderr.puts l + "\n" }
|
98
|
+
err
|
90
99
|
end
|
91
100
|
|
92
|
-
|
@@ -1,43 +1,57 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
#
|
4
|
-
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
5
|
-
# @license Artistic-2.0
|
6
|
-
#
|
3
|
+
# frozen_string_literal: false
|
7
4
|
|
8
|
-
|
5
|
+
$VERSION = 1.0
|
6
|
+
$:.push File.expand_path('../lib', __FILE__)
|
7
|
+
require 'enveomics_rb/enveomics'
|
9
8
|
|
10
|
-
o = {q: false, rep: false}
|
11
|
-
ARGV << '-h' if ARGV.size==0
|
9
|
+
o = { q: false, rep: false }
|
12
10
|
|
13
11
|
OptionParser.new do |opt|
|
14
|
-
|
15
|
-
Samples a random set of sequences from a multi-FastA file
|
16
|
-
|
17
|
-
|
18
|
-
opt.separator ''
|
12
|
+
Enveomics.opt_banner(
|
13
|
+
opt, 'Samples a random set of sequences from a multi-FastA file',
|
14
|
+
"#{File.basename($0)} -i seq.fa -o 10pc.fa -f 0.1 [options]"
|
15
|
+
)
|
19
16
|
opt.separator 'Mandatory'
|
20
|
-
opt.on(
|
21
|
-
|
22
|
-
|
17
|
+
opt.on(
|
18
|
+
'-i', '--in PATH',
|
19
|
+
'Input FastA file',
|
20
|
+
'Supports compression with .gz extension, use - for STDIN'
|
21
|
+
) { |v| o[:i] = v }
|
22
|
+
opt.on(
|
23
|
+
'-o', '--out PATH',
|
24
|
+
'Output FastA file',
|
25
|
+
'Supports compression with .gz extension, use - for STDOUT'
|
26
|
+
) { |v| o[:o] = v }
|
27
|
+
opt.on(
|
28
|
+
'-f', '--fraction FLOAT', Float,
|
23
29
|
'Fraction of sequences to sample [0-1].',
|
24
|
-
'Mandatory unless -
|
30
|
+
'Mandatory unless -c is provided.'
|
31
|
+
) { |v| o[:f] = v }
|
25
32
|
opt.separator ''
|
33
|
+
|
26
34
|
opt.separator 'Options'
|
27
|
-
opt.on(
|
28
|
-
'
|
29
|
-
'
|
30
|
-
|
31
|
-
|
35
|
+
opt.on(
|
36
|
+
'-c', '--number INT', Integer,
|
37
|
+
'Number of sequences to sample',
|
38
|
+
'Mandatory unless -f is provided'
|
39
|
+
) { |v| o[:n] = v }
|
40
|
+
opt.on('-r', '--replacement','Sample with replacement') { |v| o[:rep] = v }
|
41
|
+
opt.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
|
32
42
|
opt.on('-h', '--help', 'Display this screen.') do
|
33
43
|
puts opt
|
34
44
|
exit
|
35
45
|
end
|
36
46
|
opt.separator ''
|
37
47
|
end.parse!
|
38
|
-
|
39
|
-
|
40
|
-
|
48
|
+
|
49
|
+
raise Enveomics::OptionError.new('-i is mandatory') if o[:i].nil?
|
50
|
+
raise Enveomics::OptionError.new('-o is mandatory') if o[:o].nil?
|
51
|
+
if o[:f].nil? && o[:n].nil?
|
52
|
+
raise Enveomics::OptionError.new('-f or -n is mandatory')
|
53
|
+
end
|
54
|
+
$QUIET = o[:q]
|
41
55
|
|
42
56
|
# Functions to parse sequences
|
43
57
|
def do_stuff(id, sq)
|
@@ -53,31 +67,32 @@ def do_stuff(id, sq)
|
|
53
67
|
end
|
54
68
|
|
55
69
|
# Parse sequences
|
56
|
-
|
70
|
+
say 'Parsing sequences'
|
57
71
|
seq = []
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
end
|
72
|
+
fh = reader(o[:i])
|
73
|
+
id = nil
|
74
|
+
sq = ''
|
75
|
+
fh.each do |ln|
|
76
|
+
next if ln =~ /^;/
|
77
|
+
if ln =~ /^>(.+)/
|
78
|
+
seq << [id, sq] unless id.nil?
|
79
|
+
id = $1
|
80
|
+
sq = ''
|
81
|
+
else
|
82
|
+
sq << ln
|
70
83
|
end
|
71
|
-
seq << [id, sq] unless id.nil?
|
72
84
|
end
|
73
|
-
|
85
|
+
seq << [id, sq] unless id.nil?
|
86
|
+
fh.close
|
87
|
+
say "Input sequences: #{seq.size}"
|
88
|
+
|
74
89
|
o[:n] ||= (seq.size * o[:f]).round
|
75
|
-
seq_o = o[:rep] ? o[:n].times.map{ seq.sample } : seq.sample(o[:n])
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
end
|
90
|
+
seq_o = o[:rep] ? o[:n].times.map { seq.sample } : seq.sample(o[:n])
|
91
|
+
fh = writer(o[:o])
|
92
|
+
seq_o.each do |i|
|
93
|
+
fh.puts ">#{i[0]}"
|
94
|
+
fh.puts i[1]
|
81
95
|
end
|
82
|
-
|
96
|
+
fh.close
|
97
|
+
say "Output sequences: #{seq_o.size}"
|
83
98
|
|
@@ -0,0 +1,69 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
require 'zlib'
|
5
|
+
|
6
|
+
o = { qual: 31, encoding: 33 }
|
7
|
+
ARGV << '-h' if ARGV.empty?
|
8
|
+
OptionParser.new do |opts|
|
9
|
+
opts.banner = "
|
10
|
+
Creates a FastQ-compliant file from a FastA file.
|
11
|
+
|
12
|
+
Usage: #{$0} [options]"
|
13
|
+
opts.separator ''
|
14
|
+
opts.separator 'Options'
|
15
|
+
opts.on(
|
16
|
+
'-i', '--in FILE', 'Input FastA file (supports .gz compression)'
|
17
|
+
) { |v| o[:in] = v }
|
18
|
+
opts.on(
|
19
|
+
'-o', '--out FILE', 'Output FastQ file (supports .gz compression)'
|
20
|
+
) { |v| o[:out] = v }
|
21
|
+
opts.on(
|
22
|
+
'-q', '--quality INT', Integer,
|
23
|
+
'PHRED quality score to use (fixed), in the range [-5, 41]',
|
24
|
+
"By default: #{o[:qual]}"
|
25
|
+
) { |v| o[:qual] = v }
|
26
|
+
opts.on(
|
27
|
+
'--encoding INT', Integer,
|
28
|
+
"Base encoding (33 or 64). By default: #{o[:encoding]}"
|
29
|
+
) { |v| o[:encoding] = v }
|
30
|
+
opts.on('-h', '--help', 'Display this screen.') do
|
31
|
+
puts opts
|
32
|
+
exit
|
33
|
+
end
|
34
|
+
opts.separator ''
|
35
|
+
end.parse!
|
36
|
+
abort '-i is mandatory' if o[:in].nil?
|
37
|
+
abort '-o is mandatory' if o[:out].nil?
|
38
|
+
abort '-q must be in the range -5 .. 41' if o[:qual] < -5 || o[:qual] > 41
|
39
|
+
|
40
|
+
# Determine quality character
|
41
|
+
$qchar = (o[:qual] + o[:encoding]).chr
|
42
|
+
|
43
|
+
# Create file handlers
|
44
|
+
ifh = o[:in] =~ /\.gz$/ ?
|
45
|
+
Zlib::GzipReader.open(o[:in]) : File.open(o[:in], 'r')
|
46
|
+
ofh = o[:out] =~ /\.gz$/ ?
|
47
|
+
Zlib::GzipWriter.open(o[:out]) : File.open(o[:out], 'w')
|
48
|
+
|
49
|
+
def print_seq(ofh, id, seq)
|
50
|
+
ofh.puts "@#{id}", seq, '+', $qchar * seq.length unless seq.empty?
|
51
|
+
end
|
52
|
+
|
53
|
+
# Generate FastQ
|
54
|
+
id = ''
|
55
|
+
seq = ''
|
56
|
+
ifh.each_line do |ln|
|
57
|
+
next if ln =~ /^;/
|
58
|
+
if ln =~ /^>(.*)/
|
59
|
+
print_seq(ofh, id, seq)
|
60
|
+
seq = ''
|
61
|
+
id = $1
|
62
|
+
else
|
63
|
+
seq += ln.chomp.upcase.gsub(/[^A-Z]/,'')
|
64
|
+
end
|
65
|
+
end
|
66
|
+
print_seq(ofh, id, seq)
|
67
|
+
ofh.close
|
68
|
+
ifh.close
|
69
|
+
|