miga-base 0.3.9.0 → 0.3.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/actions/add.rb +33 -33
- data/actions/edit.rb +33 -0
- data/actions/new.rb +17 -18
- data/actions/next_step.rb +33 -0
- data/actions/run.rb +15 -12
- data/bin/miga +43 -37
- data/lib/miga/daemon.rb +2 -2
- data/lib/miga/project/result.rb +16 -1
- data/lib/miga/version.rb +2 -2
- data/scripts/aai_distances.bash +1 -3
- data/scripts/ani_distances.bash +1 -3
- data/scripts/assembly.bash +1 -3
- data/scripts/cds.bash +1 -3
- data/scripts/clade_finding.bash +1 -3
- data/scripts/d.bash +13 -0
- data/scripts/distances.bash +1 -3
- data/scripts/essential_genes.bash +1 -3
- data/scripts/haai_distances.bash +1 -3
- data/scripts/miga.bash +12 -9
- data/scripts/mytaxa.bash +1 -3
- data/scripts/mytaxa_scan.bash +1 -3
- data/scripts/ogs.bash +36 -33
- data/scripts/p.bash +23 -0
- data/scripts/project_stats.bash +1 -3
- data/scripts/read_quality.bash +1 -3
- data/scripts/ssu.bash +1 -3
- data/scripts/stats.bash +1 -3
- data/scripts/subclades.bash +1 -3
- data/scripts/taxonomy.bash +1 -3
- data/scripts/trimmed_fasta.bash +1 -3
- data/scripts/trimmed_reads.bash +1 -3
- data/test/daemon_test.rb +3 -3
- data/utils/distance/runner.rb +1 -1
- data/utils/enveomics/Docs/recplot2.md +13 -2
- data/utils/enveomics/Examples/aai-matrix.bash +3 -3
- data/utils/enveomics/Examples/ani-matrix.bash +3 -3
- data/utils/enveomics/Makefile +2 -2
- data/utils/enveomics/Manifest/Tasks/blasttab.json +12 -4
- data/utils/enveomics/Manifest/Tasks/fasta.json +135 -0
- data/utils/enveomics/Manifest/Tasks/other.json +49 -0
- data/utils/enveomics/Manifest/categories.json +4 -0
- data/utils/enveomics/Manifest/examples.json +1 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -1
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -1
- data/utils/enveomics/Scripts/BlastTab.catsbj.pl +63 -65
- data/utils/enveomics/Scripts/BlastTab.recplot2.R +4 -2
- data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
- data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
- data/utils/enveomics/Scripts/FastA.sample.rb +83 -0
- data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
- data/utils/enveomics/Scripts/aai.rb +4 -3
- data/utils/enveomics/Scripts/lib/enveomics.R +1 -1
- data/utils/enveomics/enveomics.R/DESCRIPTION +1 -2
- data/utils/enveomics/enveomics.R/NAMESPACE +3 -3
- data/utils/enveomics/enveomics.R/R/recplot.R +2 -3
- data/utils/enveomics/enveomics.R/R/recplot2.R +221 -160
- data/utils/enveomics/enveomics.R/R/utils.R +19 -1
- data/utils/enveomics/enveomics.R/README.md +11 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +2 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +1 -0
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -5
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +10 -8
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +1 -1
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +26 -0
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +22 -0
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +13 -7
- data/utils/enveomics/enveomics.R/man/z$-methods.Rd +3 -4
- data/utils/subclade/runner.rb +4 -0
- metadata +14 -3
@@ -41,14 +41,17 @@
|
|
41
41
|
"FastQ.test-error.rb"
|
42
42
|
],
|
43
43
|
"Manipulation": [
|
44
|
+
"FastA.extract.rb",
|
44
45
|
"FastA.filter.pl",
|
45
46
|
"FastA.filterLen.pl",
|
46
47
|
"FastA.filterN.pl",
|
47
48
|
"FastA.fragment.rb",
|
48
49
|
"FastA.interpose.pl",
|
50
|
+
"FastA.mask.rb",
|
49
51
|
"FastA.per_file.pl",
|
50
52
|
"FastA.rename.pl",
|
51
53
|
"FastA.revcom.pl",
|
54
|
+
"FastA.sample.rb",
|
52
55
|
"FastA.slider.pl",
|
53
56
|
"FastA.split.pl",
|
54
57
|
"FastA.split.rb",
|
@@ -143,6 +146,7 @@
|
|
143
146
|
"BlastTab.catsbj.pl",
|
144
147
|
"BlastTab.pairedHits.rb",
|
145
148
|
"BlastTab.recplot2.R",
|
149
|
+
"GFF.catsbj.pl",
|
146
150
|
"RecPlot2.compareIdentities.R"
|
147
151
|
]
|
148
152
|
}
|
@@ -57,7 +57,7 @@
|
|
57
57
|
"description": ["Generates recruitment plots for a comparison",
|
58
58
|
"between a virome containing HIV and the HIV-1 genome."],
|
59
59
|
"values": ["hiv_mix-hiv1.blast.tsv",50,100,null,null,null,null,null,null,
|
60
|
-
null,null,"hiv_mix-hiv1.Rdata","hiv_mix-hiv1.pdf",null,null]
|
60
|
+
null,null,null,"hiv_mix-hiv1.Rdata","hiv_mix-hiv1.pdf",null,null]
|
61
61
|
},
|
62
62
|
{
|
63
63
|
"_": "== Examples of functional annotations ==",
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.N50.pl
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.filterN.pl
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.length.pl
|
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
utils/enveomics/Pipelines/blast.pbs/../../Scripts/FastA.split.pl
|
@@ -1,9 +1,7 @@
|
|
1
1
|
#!/usr/bin/env perl
|
2
|
-
|
3
|
-
# @author: Luis M. Rodriguez-R
|
4
|
-
# @
|
5
|
-
# @license: artistic license 2.0
|
6
|
-
#
|
2
|
+
|
3
|
+
# @author: Luis M. Rodriguez-R
|
4
|
+
# @license: Artistic-2.0
|
7
5
|
|
8
6
|
use warnings;
|
9
7
|
use strict;
|
@@ -13,29 +11,29 @@ use Getopt::Std;
|
|
13
11
|
sub HELP_MESSAGE { die "
|
14
12
|
|
15
13
|
Description:
|
16
|
-
|
17
|
-
|
18
|
-
|
14
|
+
Generates a list of hits from a BLAST result concatenating the subject
|
15
|
+
sequences. This can be used, e.g., to analyze BLAST results against
|
16
|
+
draft genomes.
|
19
17
|
|
20
18
|
Usage:
|
21
|
-
|
19
|
+
$0 [options] seq.fa map.bls
|
22
20
|
|
23
|
-
|
24
|
-
|
21
|
+
seq.fa Subject sequences (ref) in FastA format.
|
22
|
+
map.bls Mapping of the reads to the reference in BLAST Tabular
|
25
23
|
format.
|
26
24
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
By default, it expects all the subjects to be present in
|
32
|
-
|
33
|
-
|
34
|
-
|
25
|
+
Options:
|
26
|
+
-i <float> Minimum identity to report a result. By default: 70.
|
27
|
+
-l <int> Minimum alignment length to report a result. By default: 60.
|
28
|
+
-s The FastA provided is to be treated as a subset of the subject.
|
29
|
+
By default, it expects all the BLAST subjects to be present in
|
30
|
+
the FastA.
|
31
|
+
-q Run quietly.
|
32
|
+
-h Display this message and exit.
|
35
33
|
|
36
|
-
|
37
|
-
|
38
|
-
|
34
|
+
This script creates two files using <map.bls> as prefix with extensions
|
35
|
+
.rec (for the recruitment plot) and .lim (for the limits of the different
|
36
|
+
sequences in <seq.fa>).
|
39
37
|
|
40
38
|
";}
|
41
39
|
|
@@ -51,56 +49,56 @@ my %seq = ();
|
|
51
49
|
my @seq = ();
|
52
50
|
my $tot = 0;
|
53
51
|
|
54
|
-
SEQ:{
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
52
|
+
SEQ: {
|
53
|
+
print STDERR "== Reading reference sequences\n" unless $o{q};
|
54
|
+
open FA, "<", $fa or die "Cannot read the file: $fa: $!\n";
|
55
|
+
my $cur_seq = '';
|
56
|
+
while(<FA>){
|
57
|
+
chomp;
|
58
|
+
if(m/^>(\S+)/){
|
59
|
+
my $c = $1;
|
60
|
+
$seq{$c} = exists $seq{$cur_seq} ? $seq{$cur_seq}+1 : 1;
|
61
|
+
push @seq, $c;
|
62
|
+
$cur_seq = $c;
|
63
|
+
}else{
|
64
|
+
s/[^A-Za-z]//g;
|
65
|
+
$seq{$cur_seq} += length $_;
|
66
|
+
}
|
67
|
+
}
|
68
|
+
close FA;
|
69
|
+
print STDERR " Found ".(scalar @seq)." sequences.\n" unless $o{q};
|
72
70
|
}
|
73
71
|
|
74
72
|
open LIM, ">", "$map.lim" or die "Cannot create the file: $map.lim: $!\n";
|
75
73
|
my $l = 0;
|
76
74
|
for my $s (@seq){
|
77
|
-
|
78
|
-
|
75
|
+
print LIM "$s\t".(++$l)."\t$seq{$s}\n";
|
76
|
+
($l, $seq{$s}) = ($seq{$s}, $l);
|
79
77
|
}
|
80
78
|
close LIM;
|
81
79
|
|
82
|
-
MAP:{
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
80
|
+
MAP: {
|
81
|
+
print STDERR "== Reading mapping\n" unless $o{q};
|
82
|
+
open BLS, "<", $map or die "Cannot read the file: $map: $!\n";
|
83
|
+
open REC, ">", "$map.rec" or die "Cannot create the file: $map.rec: $!\n";
|
84
|
+
RESULT: while(<BLS>){
|
85
|
+
chomp;
|
86
|
+
my @ln = split /\t/;
|
87
|
+
$ln[11] or die "Cannot parse line $map:$.: $_\n";
|
88
|
+
next unless $ln[3]>=$o{l};
|
89
|
+
next unless $ln[2]>=$o{i};
|
90
|
+
unless(exists $seq{$ln[1]}){
|
91
|
+
die "Cannot find the subject sequence: $ln[1]\n" unless $o{s};
|
92
|
+
next RESULT;
|
93
|
+
}
|
94
|
+
my $start = $seq{$ln[1]}+min($ln[8], $ln[9]);
|
95
|
+
my $end = $seq{$ln[1]}+max($ln[8], $ln[9]);
|
96
|
+
print REC "$start\t$end\t$ln[2]\t$ln[11]\t$ln[0]",
|
97
|
+
(exists($ln[13])?"\t".($ln[2]*$ln[3]/min($ln[12],$ln[13]))."\t":
|
98
|
+
exists($ln[12])?"\t".($ln[2]*$ln[3]/$ln[12])."\t":""),"\n";
|
99
|
+
}
|
100
|
+
close BLS;
|
101
|
+
close REC;
|
102
|
+
print STDERR " done.\n" unless $o{q};
|
105
103
|
}
|
106
104
|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env Rscript
|
2
2
|
|
3
3
|
# @author Luis M. Rodriguez-R
|
4
|
-
# @license
|
4
|
+
# @license Artistic-2.0
|
5
5
|
|
6
6
|
#= Load stuff
|
7
7
|
suppressPackageStartupMessages(library(enveomics.R))
|
@@ -17,6 +17,7 @@ opt <- enve.cliopts(enve.recplot2,
|
|
17
17
|
usage="usage: %prog [options] output.Rdata [output.pdf [width height]]",
|
18
18
|
mandatory=c("prefix"),
|
19
19
|
o_desc=list(pos.breaks="Breaks in the positions histogram.",
|
20
|
+
pos.breaks.tsv="File with (absolute) coordinates of breaks in the position histogram",
|
20
21
|
id.breaks="Breaks in the identity histogram.",
|
21
22
|
id.summary="Function summarizing the identity bins. By default: sum.",
|
22
23
|
peaks.col="Color of peaks, mandatory for peak-finding (e.g., darkred).",
|
@@ -24,7 +25,8 @@ opt <- enve.cliopts(enve.recplot2,
|
|
24
25
|
p_desc=paste("","Produce recruitment plot objects provided that",
|
25
26
|
"BlastTab.catsbj.pl has been previously executed.", sep="\n\t"),
|
26
27
|
ignore=c("plot"),
|
27
|
-
defaults=c(id.metric="identity", peaks.col=NA,
|
28
|
+
defaults=c(pos.breaks.tsv=NA, id.metric="identity", peaks.col=NA,
|
29
|
+
peaks.method="emauto"))
|
28
30
|
|
29
31
|
#= Run it!
|
30
32
|
if(length(opt$args)>1){
|
@@ -0,0 +1,152 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
#
|
4
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
5
|
+
# @license Artistic-2.0
|
6
|
+
#
|
7
|
+
|
8
|
+
require 'optparse'
|
9
|
+
|
10
|
+
# Command-line options; :q (quiet) is the only one with a default.
o = { q: false }
# Running without arguments behaves like asking for help.
ARGV.push('-h') if ARGV.empty?

cli = OptionParser.new
cli.banner = "
Extracts a list of sequences and/or coordinates from multi-FastA files.

Usage: #{$0} [options]"
cli.separator ''
cli.separator 'Mandatory'
cli.on('-i', '--in PATH', 'Input FastA file.') { |path| o[:i] = path }
cli.on('-o', '--out PATH', 'Output FastA file.') { |path| o[:o] = path }
cli.on(
  '-c', '--coords STRING',
  'Comma-delimited list of coordinates (mandatory unless -C is passed).',
  'The format of the coordinates is "SEQ:FROM..TO" or "SEQ:FROM~LEN":',
  'SEQ: Sequence ID, or * (asterisk) to extract range from all sequences',
  'FROM: Integer, position of the first base to include (can be negative)',
  'TO: Integer, last base to include (can be negative)',
  'LEN: Length of the range to extract'
) { |coords| o[:c] = coords }
cli.separator ''
cli.separator 'Options'
cli.on(
  '-C', '--coords-file PATH',
  'File containing the coordinates, one per line.',
  'Each line must follow the format described for -c.'
) { |path| o[:C] = path }
cli.on('-q', '--quiet', 'Run quietly (no STDERR output).') { o[:q] = true }
cli.on('-h', '--help', 'Display this screen.') do
  puts cli
  exit
end
cli.separator ''
cli.parse!

# Input and output are always required; coordinates may come from -c or -C.
abort '-i is mandatory.' if o[:i].nil?
abort '-o is mandatory.' if o[:o].nil?
abort '-c is mandatory.' if [o[:c], o[:C]].all?(&:nil?)
|
45
|
+
|
46
|
+
# Classes to parse coordinates
|
47
|
+
# One parsed coordinate specification, written either as "SEQ:FROM..TO" or
# as "SEQ:FROM~LEN".
#
# FROM and TO are 1-based positions, as described in the help of -c. Values
# that are not positive are resolved against the end of the sequence, so -1
# is the last base.
class SeqCoords
  attr_reader :id, :from, :to, :length, :str

  # Parses +str+ into an ID, a start, and either an end or a length.
  # Raises RuntimeError if +str+ does not contain a coordinate.
  def initialize(str)
    @str = str
    m = /(\S+):(-?\d+)(~|\.\.)(-?\d+)/.match str
    raise "Cannot parse coordinates: #{str}" if m.nil?
    @id = m[1]
    @from = m[2].to_i
    @to = nil
    @length = nil
    if m[3] == '~'
      @length = m[4].to_i
    else
      @to = m[4].to_i
    end
  end

  # Returns the fragment of +seq+ described by these coordinates, or nil if
  # the coordinates do not concern +id+ or fall outside of +seq+.
  def extract(id, seq)
    return nil unless concerns? id
    # Positions are 1-based, Ruby string indexes are 0-based
    first = position(from, seq) - 1
    # A start outside of the sequence cannot be indexed safely: a negative
    # index would silently wrap around to the end of the string
    return nil unless first.between?(0, seq.length - 1)
    return seq[first, length] if to.nil?
    last = position(to, seq) - 1
    return nil if last < first
    seq[first..last]
  end

  # Do these coordinates apply to the sequence +seq_id+? The ID "*" applies
  # to every sequence.
  def concerns?(seq_id)
    id == '*' || id == seq_id
  end

  private

  # Resolves +pos+ to a 1-based position from the start of +seq+.
  def position(pos, seq)
    pos > 0 ? pos : seq.length + 1 + pos
  end
end
|
78
|
+
|
79
|
+
# Ordered list of SeqCoords that are applied together to each sequence.
class SeqCoordsCollection
  class << self
    # Builds a collection from a comma-delimited list of coordinates.
    def from_str(str)
      c = new
      str.split(',').each { |i| c << SeqCoords.new(i) }
      c
    end

    # Builds a collection from the file at +path+, one coordinate per line.
    # Blank lines are ignored instead of being reported as parsing errors.
    def from_file(path)
      c = new
      File.open(path, 'r') do |fh|
        fh.each do |ln|
          next if ln.strip.empty?
          c << SeqCoords.new(ln.chomp)
        end
      end
      c
    end
  end

  attr_reader :collection

  def initialize
    @collection = []
  end

  # Appends +coords+ to the collection.
  def <<(coords)
    @collection << coords
  end

  # Returns the fragments of +seq+ for every coordinate that concerns +id+,
  # in the order in which the coordinates were added.
  def extract(id, seq)
    @collection.map { |c| c.extract(id, seq) }.compact
  end
end
|
109
|
+
|
110
|
+
# Functions to parse sequences
|
111
|
+
# Writes to @ofh every fragment that @coll extracts from the sequence +sq+
# with ID +id+, naming the fragments "ID:1", "ID:2", and so on. Updates the
# counters @n_in and @n_out. Does nothing if +id+ is nil or +sq+ is empty.
def do_stuff(id, sq)
  return if id.nil? || sq.empty?
  @n_in += 1
  # Work on a clean copy so the caller's buffer is left untouched
  residues = sq.gsub(/[^A-Za-z]/, '')
  @coll.extract(id, residues).each_with_index do |new_sq, k|
    @ofh.puts ">#{id}:#{k + 1}"
    @ofh.puts new_sq
    @n_out += 1
  end
end
|
122
|
+
|
123
|
+
# Parse coordinates
|
124
|
+
$stderr.puts 'Parsing coordinates' unless o[:q]
@coll = o[:c].nil? ? SeqCoordsCollection.from_file(o[:C]) :
  SeqCoordsCollection.from_str(o[:c])
$stderr.puts " Coordinates found: #{@coll.collection.size}" unless o[:q]

# Parse sequences
$stderr.puts 'Parsing sequences' unless o[:q]
@n_in = 0
@n_out = 0
# Block form, so both files are closed even if parsing fails midway
File.open(o[:o], 'w') do |ofh|
  @ofh = ofh # do_stuff writes through this handle
  File.open(o[:i], 'r') do |fh|
    id = nil
    sq = ''
    fh.each do |ln|
      next if ln =~ /^;/
      if ln =~ /^>(\S+)/
        next_id = $1
        # Flush the finished record under its own ID before switching to the
        # ID of the record that starts on this line
        do_stuff(id, sq)
        id = next_id
        sq = ''
      else
        sq << ln
      end
    end
    # The last record has no following header to trigger the flush
    do_stuff(id, sq)
  end
end
$stderr.puts " Input sequences: #{@n_in}" unless o[:q]
$stderr.puts " Output fragments: #{@n_out}" unless o[:q]
|
152
|
+
|
@@ -0,0 +1,89 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
# Command-line options and their defaults: mask symbol, trimming of masked
# edges, and line width of the output sequences.
o = { x: 'N', trim: false, wrap: 70 }
# Running without arguments behaves like asking for help.
ARGV.push('-h') if ARGV.empty?

cli = OptionParser.new
cli.banner = "
Mask sequence region(s) in a FastA file.

Usage: #{$0} [options]"
cli.separator ''
cli.separator 'Mandatory'
cli.on('-i', '--in FILE', 'Input FastA file.') { |path| o[:in] = path }
cli.on('-o', '--out FILE', 'Output FastA file.') { |path| o[:out] = path }
cli.on(
  '-r', '--regions REG1,REG2,...', Array,
  'Regions to mask separated by commas.',
  'Each region must be in the format "sequence_id:from..to"'
) { |regions| o[:reg] = regions }
cli.separator ''
cli.separator 'Options'
cli.on(
  '-x', '--symbol CHAR',
  'Character used to mask the region(s)',
  "By default: #{o[:x]}."
) { |symbol| o[:x] = symbol }
cli.on(
  '-t', '--trim',
  'Trim masked regions extending to the edge of a sequence'
) { |flag| o[:trim] = flag }
cli.on(
  '-w', '--wrap INT',
  'Line length to wrap sequences. Use 0 to generate 1-line sequences.',
  "By default: #{o[:wrap]}."
) { |width| o[:wrap] = width.to_i }
cli.on('-h', '--help', 'Display this screen.') do
  puts cli
  exit
end
cli.separator ''
cli.parse!

# Input, output, and regions are all required.
abort '-i is mandatory' if o[:in].nil?
abort '-o is mandatory' if o[:out].nil?
abort '-r is mandatory' if o[:reg].nil?
|
39
|
+
|
40
|
+
# Returns +txt+ split in lines of at most +len+ characters, each terminated
# by a newline. An empty +txt+ yields an empty string. A +len+ of zero (or
# any negative value, which cannot be a line width) yields +txt+ as a single
# terminated line.
def wrap_width(txt, len)
  return '' if txt.empty?
  return "#{txt}\n" unless len > 0
  txt.gsub(/.{1,#{len}}/) { |chunk| "#{chunk}\n" }
end
|
45
|
+
|
46
|
+
# Read input sequences
|
47
|
+
# Maps each sequence ID to a pair [definition line, sequence]. Records are
# kept in file order, and a repeated ID overwrites the earlier record.
sq = {}
File.open(o[:in], 'r') do |ifh|
  # Reading up to each '>' yields one record per iteration: the definition
  # line, the sequence lines, and the '>' that opens the next record
  ifh.each('>') do |record|
    dln, seq = record.split(/[\n\r]+/, 2)
    next if seq.nil?
    seq.gsub!(/[\s>]/, '')
    # Blank lines before the first '>' are not a sequence record
    next if dln.empty? && seq.empty?
    # The ID is the definition line up to the first whitespace
    id = dln.gsub(/\s.*/, '')
    sq[id] = [dln, seq]
  end
end
|
58
|
+
|
59
|
+
# Parse coordinates and mask regions
|
60
|
+
# A region without a sequence ID reuses the ID of the region before it.
last_id = nil
o[:reg].each do |reg|
  m = reg.match(/^(?:(.+):)?(\d+)\.\.(\d+)$/)
  abort "Unexpected region format: #{reg}" if m.nil?
  id = m[1] || last_id
  abort "Region missing sequence ID: #{reg}" if id.nil?
  last_id = id
  # Regions are 1-based and inclusive; string indexes are 0-based
  from = m[2].to_i - 1
  to = m[3].to_i - 1
  abort "Cannot find sequence #{id}" unless sq[id]
  abort "Malformed range: #{reg}" unless from <= to
  # The last valid index is size - 1: assigning past it would lengthen the
  # sequence instead of masking it
  if from < 0 || to >= sq[id][1].size
    abort "Range extends beyond the edge of the sequence: #{reg}"
  end
  sq[id][1][from..to] = o[:x] * (1 + to - from)
end
|
77
|
+
|
78
|
+
# Trim sequences and generate output
|
79
|
+
# Block form, so the output is flushed and closed even if writing fails
File.open(o[:out], 'w') do |ofh|
  # The mask symbol is user input: escape it so characters such as '.' or
  # '*' are matched literally, and group it so a multi-character symbol is
  # repeated as a whole
  mask = "(?:#{Regexp.escape(o[:x])})+"
  sq.each_value do |dln, seq|
    ofh.puts ">#{dln}"
    # Trimming only removes masked stretches touching either sequence edge
    seq = seq.sub(/\A#{mask}/, '').sub(/#{mask}\z/, '') if o[:trim]
    ofh.print wrap_width(seq, o[:wrap])
  end
end
|
89
|
+
|
@@ -0,0 +1,83 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
#
|
4
|
+
# @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
5
|
+
# @license Artistic-2.0
|
6
|
+
#
|
7
|
+
|
8
|
+
require 'optparse'
|
9
|
+
|
10
|
+
# Command-line options: quiet mode and sampling with replacement are off by
# default.
o = {q: false, rep: false}
# Running without arguments behaves like asking for help.
ARGV << '-h' if ARGV.empty?

OptionParser.new do |opt|
  opt.banner = "
Samples a random set of sequences from a multi-FastA file.

Usage: #{$0} [options]"
  opt.separator ''
  opt.separator 'Mandatory'
  opt.on('-i', '--in PATH', 'Input FastA file.'){ |v| o[:i] = v }
  opt.on('-o', '--out PATH', 'Output FastA file.'){ |v| o[:o] = v }
  opt.on('-f', '--fraction FLOAT',
    'Fraction of sequences to sample [0-1].',
    'Mandatory unless -n is provided.'){ |v| o[:f] = v.to_f }
  opt.separator ''
  opt.separator 'Options'
  # The help text and the error message below refer to this switch as -n;
  # -c is kept as an alias so existing command lines continue to work
  opt.on('-n', '-c', '--number INT',
    'Number of sequences to sample.',
    'Mandatory unless -f is provided.'){ |v| o[:n] = v.to_i }
  opt.on('-r', '--replacement','Sample with replacement'){ |v| o[:rep] = v }
  opt.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
  opt.on('-h', '--help', 'Display this screen.') do
    puts opt
    exit
  end
  opt.separator ''
end.parse!
abort '-i is mandatory.' if o[:i].nil?
abort '-o is mandatory.' if o[:o].nil?
abort '-f or -n is mandatory.' if o[:f].nil? && o[:n].nil?
|
41
|
+
|
42
|
+
# Functions to parse sequences
|
43
|
+
def do_stuff(id, sq)
|
44
|
+
return if id.nil? or sq.empty?
|
45
|
+
@n_in += 1
|
46
|
+
sq.gsub!(/[^A-Za-z]/, '')
|
47
|
+
i = 0
|
48
|
+
@coll.extract(id, sq).each do |new_sq|
|
49
|
+
@ofh.puts ">#{id}:#{i += 1}"
|
50
|
+
@ofh.puts new_sq
|
51
|
+
@n_out += 1
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
# Parse sequences
|
56
|
+
$stderr.puts 'Parsing sequences' unless o[:q]
# All the records are loaded as pairs [definition line, raw sequence lines]
seq = []
File.open(o[:i], 'r') do |fh|
  id = nil
  sq = ''
  fh.each do |ln|
    next if ln =~ /^;/
    if ln =~ /^>(.+)/
      seq << [id, sq] unless id.nil?
      id = $1
      sq = ''
    else
      sq << ln
    end
  end
  # The last record has no following header to trigger the push
  seq << [id, sq] unless id.nil?
end
$stderr.puts " Input sequences: #{seq.size}" unless o[:q]

# An explicit number of sequences takes precedence over the fraction
o[:n] ||= (seq.size * o[:f]).round
seq_o =
  if seq.empty?
    [] # Nothing to draw from; sampling with replacement would yield nils
  elsif o[:rep]
    o[:n].times.map { seq.sample }
  else
    seq.sample(o[:n])
  end
File.open(o[:o], 'w') do |fh|
  seq_o.each do |dln, sq|
    fh.puts ">#{dln}"
    fh.puts sq
  end
end
$stderr.puts " Output sequences: #{seq_o.size}" unless o[:q]
|
83
|
+
|