miga-base 0.3.9.0 → 0.3.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +4 -4
  2. data/actions/add.rb +33 -33
  3. data/actions/edit.rb +33 -0
  4. data/actions/new.rb +17 -18
  5. data/actions/next_step.rb +33 -0
  6. data/actions/run.rb +15 -12
  7. data/bin/miga +43 -37
  8. data/lib/miga/daemon.rb +2 -2
  9. data/lib/miga/project/result.rb +16 -1
  10. data/lib/miga/version.rb +2 -2
  11. data/scripts/aai_distances.bash +1 -3
  12. data/scripts/ani_distances.bash +1 -3
  13. data/scripts/assembly.bash +1 -3
  14. data/scripts/cds.bash +1 -3
  15. data/scripts/clade_finding.bash +1 -3
  16. data/scripts/d.bash +13 -0
  17. data/scripts/distances.bash +1 -3
  18. data/scripts/essential_genes.bash +1 -3
  19. data/scripts/haai_distances.bash +1 -3
  20. data/scripts/miga.bash +12 -9
  21. data/scripts/mytaxa.bash +1 -3
  22. data/scripts/mytaxa_scan.bash +1 -3
  23. data/scripts/ogs.bash +36 -33
  24. data/scripts/p.bash +23 -0
  25. data/scripts/project_stats.bash +1 -3
  26. data/scripts/read_quality.bash +1 -3
  27. data/scripts/ssu.bash +1 -3
  28. data/scripts/stats.bash +1 -3
  29. data/scripts/subclades.bash +1 -3
  30. data/scripts/taxonomy.bash +1 -3
  31. data/scripts/trimmed_fasta.bash +1 -3
  32. data/scripts/trimmed_reads.bash +1 -3
  33. data/test/daemon_test.rb +3 -3
  34. data/utils/distance/runner.rb +1 -1
  35. data/utils/enveomics/Docs/recplot2.md +13 -2
  36. data/utils/enveomics/Examples/aai-matrix.bash +3 -3
  37. data/utils/enveomics/Examples/ani-matrix.bash +3 -3
  38. data/utils/enveomics/Makefile +2 -2
  39. data/utils/enveomics/Manifest/Tasks/blasttab.json +12 -4
  40. data/utils/enveomics/Manifest/Tasks/fasta.json +135 -0
  41. data/utils/enveomics/Manifest/Tasks/other.json +49 -0
  42. data/utils/enveomics/Manifest/categories.json +4 -0
  43. data/utils/enveomics/Manifest/examples.json +1 -1
  44. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -1
  45. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -1
  46. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -1
  47. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -1
  48. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +63 -65
  49. data/utils/enveomics/Scripts/BlastTab.recplot2.R +4 -2
  50. data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
  51. data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
  52. data/utils/enveomics/Scripts/FastA.sample.rb +83 -0
  53. data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
  54. data/utils/enveomics/Scripts/aai.rb +4 -3
  55. data/utils/enveomics/Scripts/lib/enveomics.R +1 -1
  56. data/utils/enveomics/enveomics.R/DESCRIPTION +1 -2
  57. data/utils/enveomics/enveomics.R/NAMESPACE +3 -3
  58. data/utils/enveomics/enveomics.R/R/recplot.R +2 -3
  59. data/utils/enveomics/enveomics.R/R/recplot2.R +221 -160
  60. data/utils/enveomics/enveomics.R/R/utils.R +19 -1
  61. data/utils/enveomics/enveomics.R/README.md +11 -0
  62. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +2 -2
  63. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +1 -0
  64. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -5
  65. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +10 -8
  66. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +1 -1
  67. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +26 -0
  68. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +22 -0
  69. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +13 -7
  70. data/utils/enveomics/enveomics.R/man/z$-methods.Rd +3 -4
  71. data/utils/subclade/runner.rb +4 -0
  72. metadata +14 -3
@@ -41,14 +41,17 @@
41
41
  "FastQ.test-error.rb"
42
42
  ],
43
43
  "Manipulation": [
44
+ "FastA.extract.rb",
44
45
  "FastA.filter.pl",
45
46
  "FastA.filterLen.pl",
46
47
  "FastA.filterN.pl",
47
48
  "FastA.fragment.rb",
48
49
  "FastA.interpose.pl",
50
+ "FastA.mask.rb",
49
51
  "FastA.per_file.pl",
50
52
  "FastA.rename.pl",
51
53
  "FastA.revcom.pl",
54
+ "FastA.sample.rb",
52
55
  "FastA.slider.pl",
53
56
  "FastA.split.pl",
54
57
  "FastA.split.rb",
@@ -143,6 +146,7 @@
143
146
  "BlastTab.catsbj.pl",
144
147
  "BlastTab.pairedHits.rb",
145
148
  "BlastTab.recplot2.R",
149
+ "GFF.catsbj.pl",
146
150
  "RecPlot2.compareIdentities.R"
147
151
  ]
148
152
  }
@@ -57,7 +57,7 @@
57
57
  "description": ["Generates recruitment plots for a comparison",
58
58
  "between a virome containing HIV and the HIV-1 genome."],
59
59
  "values": ["hiv_mix-hiv1.blast.tsv",50,100,null,null,null,null,null,null,
60
- null,null,"hiv_mix-hiv1.Rdata","hiv_mix-hiv1.pdf",null,null]
60
+ null,null,null,"hiv_mix-hiv1.Rdata","hiv_mix-hiv1.pdf",null,null]
61
61
  },
62
62
  {
63
63
  "_": "== Examples of functional annotations ==",
@@ -1 +1 @@
1
- ../../Scripts/FastA.N50.pl
1
+ utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.N50.pl
@@ -1 +1 @@
1
- ../../Scripts/FastA.filterN.pl
1
+ utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.filterN.pl
@@ -1 +1 @@
1
- ../../Scripts/FastA.length.pl
1
+ utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.length.pl
@@ -1 +1 @@
1
- ../../Scripts/FastA.split.pl
1
+ utils/enveomics/Pipelines/blast.pbs/../../Scripts/FastA.split.pl
@@ -1,9 +1,7 @@
1
1
  #!/usr/bin/env perl
2
- #
3
- # @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
- # @updated: Mar-23-2015
5
- # @license: artistic license 2.0
6
- #
2
+
3
+ # @author: Luis M. Rodriguez-R
4
+ # @license: Artistic-2.0
7
5
 
8
6
  use warnings;
9
7
  use strict;
@@ -13,29 +11,29 @@ use Getopt::Std;
13
11
  sub HELP_MESSAGE { die "
14
12
 
15
13
  Description:
16
- Generates a list of hits from a BLAST result concatenating the subject
17
- sequences. This can be used, e.g., to analyze BLAST results against
18
- draft genomes.
14
+ Generates a list of hits from a BLAST result concatenating the subject
15
+ sequences. This can be used, e.g., to analyze BLAST results against
16
+ draft genomes.
19
17
 
20
18
  Usage:
21
- $0 [options] seq.fa map.bls
19
+ $0 [options] seq.fa map.bls
22
20
 
23
- seq.fa Subject sequences (ref) in FastA format.
24
- map.bls Mapping of the reads to the reference in BLAST Tabular
21
+ seq.fa Subject sequences (ref) in FastA format.
22
+ map.bls Mapping of the reads to the reference in BLAST Tabular
25
23
  format.
26
24
 
27
- Options:
28
- -i <float> Minimum identity to report a result. By default: 70.
29
- -l <int> Minimum alignment length to report a result. By default: 60.
30
- -s The FastA provided is to be treated as a subset of the subject.
31
- By default, it expects all the subjects to be present in the
32
- BLAST.
33
- -q Run quietly.
34
- -h Display this message and exit.
25
+ Options:
26
+ -i <float> Minimum identity to report a result. By default: 70.
27
+ -l <int> Minimum alignment length to report a result. By default: 60.
28
+ -s The FastA provided is to be treated as a subset of the subject.
29
+ By default, it expects all the BLAST subjects to be present in
30
+ the FastA.
31
+ -q Run quietly.
32
+ -h Display this message and exit.
35
33
 
36
- This script creates two files using <map.bls> as prefix with extensions
37
- .rec (for the recruitment plot) and .lim (for the limits of the different
38
- sequences in <seq.fa>).
34
+ This script creates two files using <map.bls> as prefix with extensions
35
+ .rec (for the recruitment plot) and .lim (for the limits of the different
36
+ sequences in <seq.fa>).
39
37
 
40
38
  ";}
41
39
 
@@ -51,56 +49,56 @@ my %seq = ();
51
49
  my @seq = ();
52
50
  my $tot = 0;
53
51
 
54
- SEQ:{
55
- print STDERR "== Reading reference sequences\n" unless $o{q};
56
- open FA, "<", $fa or die "Cannot read the file: $fa: $!\n";
57
- my $cur_seq = '';
58
- while(<FA>){
59
- chomp;
60
- if(m/^>(\S+)/){
61
- my $c = $1;
62
- $seq{$c} = exists $seq{$cur_seq} ? $seq{$cur_seq}+1 : 1;
63
- push @seq, $c;
64
- $cur_seq = $c;
65
- }else{
66
- s/[^A-Za-z]//g;
67
- $seq{$cur_seq} += length $_;
68
- }
69
- }
70
- close FA;
71
- print STDERR " Found ".(scalar @seq)." sequences.\n" unless $o{q};
52
+ SEQ: {
53
+ print STDERR "== Reading reference sequences\n" unless $o{q};
54
+ open FA, "<", $fa or die "Cannot read the file: $fa: $!\n";
55
+ my $cur_seq = '';
56
+ while(<FA>){
57
+ chomp;
58
+ if(m/^>(\S+)/){
59
+ my $c = $1;
60
+ $seq{$c} = exists $seq{$cur_seq} ? $seq{$cur_seq}+1 : 1;
61
+ push @seq, $c;
62
+ $cur_seq = $c;
63
+ }else{
64
+ s/[^A-Za-z]//g;
65
+ $seq{$cur_seq} += length $_;
66
+ }
67
+ }
68
+ close FA;
69
+ print STDERR " Found ".(scalar @seq)." sequences.\n" unless $o{q};
72
70
  }
73
71
 
74
72
  open LIM, ">", "$map.lim" or die "Cannot create the file: $map.lim: $!\n";
75
73
  my $l = 0;
76
74
  for my $s (@seq){
77
- print LIM "$s\t".(++$l)."\t$seq{$s}\n";
78
- ($l, $seq{$s}) = ($seq{$s}, $l);
75
+ print LIM "$s\t".(++$l)."\t$seq{$s}\n";
76
+ ($l, $seq{$s}) = ($seq{$s}, $l);
79
77
  }
80
78
  close LIM;
81
79
 
82
- MAP:{
83
- print STDERR "== Reading mapping\n" unless $o{q};
84
- open BLS, "<", $map or die "Cannot read the file: $map: $!\n";
85
- open REC, ">", "$map.rec" or die "Cannot create the file: $map.rec: $!\n";
86
- RESULT:while(<BLS>){
87
- chomp;
88
- my @ln = split /\t/;
89
- $ln[11] or die "Cannot parse line $map:$.: $_\n";
90
- next unless $ln[3]>=$o{l};
91
- next unless $ln[2]>=$o{i};
92
- unless(exists $seq{$ln[1]}){
93
- die "Cannot find the subject sequence: $ln[1]\n" unless $o{s};
94
- next RESULT;
95
- }
96
- my $start = $seq{$ln[1]}+min($ln[8], $ln[9]);
97
- my $end = $seq{$ln[1]}+max($ln[8], $ln[9]);
98
- print REC "$start\t$end\t$ln[2]\t$ln[11]\t$ln[0]",
99
- (exists($ln[13])?"\t".($ln[2]*$ln[3]/min($ln[12],$ln[13]))."\t":
100
- exists($ln[12])?"\t".($ln[2]*$ln[3]/$ln[12])."\t":""),"\n";
101
- }
102
- close BLS;
103
- close REC;
104
- print STDERR " done.\n" unless $o{q};
80
+ MAP: {
81
+ print STDERR "== Reading mapping\n" unless $o{q};
82
+ open BLS, "<", $map or die "Cannot read the file: $map: $!\n";
83
+ open REC, ">", "$map.rec" or die "Cannot create the file: $map.rec: $!\n";
84
+ RESULT: while(<BLS>){
85
+ chomp;
86
+ my @ln = split /\t/;
87
+ $ln[11] or die "Cannot parse line $map:$.: $_\n";
88
+ next unless $ln[3]>=$o{l};
89
+ next unless $ln[2]>=$o{i};
90
+ unless(exists $seq{$ln[1]}){
91
+ die "Cannot find the subject sequence: $ln[1]\n" unless $o{s};
92
+ next RESULT;
93
+ }
94
+ my $start = $seq{$ln[1]}+min($ln[8], $ln[9]);
95
+ my $end = $seq{$ln[1]}+max($ln[8], $ln[9]);
96
+ print REC "$start\t$end\t$ln[2]\t$ln[11]\t$ln[0]",
97
+ (exists($ln[13])?"\t".($ln[2]*$ln[3]/min($ln[12],$ln[13]))."\t":
98
+ exists($ln[12])?"\t".($ln[2]*$ln[3]/$ln[12])."\t":""),"\n";
99
+ }
100
+ close BLS;
101
+ close REC;
102
+ print STDERR " done.\n" unless $o{q};
105
103
  }
106
104
 
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env Rscript
2
2
 
3
3
  # @author Luis M. Rodriguez-R
4
- # @license artistic license 2.0
4
+ # @license Artistic-2.0
5
5
 
6
6
  #= Load stuff
7
7
  suppressPackageStartupMessages(library(enveomics.R))
@@ -17,6 +17,7 @@ opt <- enve.cliopts(enve.recplot2,
17
17
  usage="usage: %prog [options] output.Rdata [output.pdf [width height]]",
18
18
  mandatory=c("prefix"),
19
19
  o_desc=list(pos.breaks="Breaks in the positions histogram.",
20
+ pos.breaks.tsv="File with (absolute) coordinates of breaks in the position histogram",
20
21
  id.breaks="Breaks in the identity histogram.",
21
22
  id.summary="Function summarizing the identity bins. By default: sum.",
22
23
  peaks.col="Color of peaks, mandatory for peak-finding (e.g., darkred).",
@@ -24,7 +25,8 @@ opt <- enve.cliopts(enve.recplot2,
24
25
  p_desc=paste("","Produce recruitment plot objects provided that",
25
26
  "BlastTab.catsbj.pl has been previously executed.", sep="\n\t"),
26
27
  ignore=c("plot"),
27
- defaults=c(id.metric="identity", peaks.col=NA, peaks.method="emauto"))
28
+ defaults=c(pos.breaks.tsv=NA, id.metric="identity", peaks.col=NA,
29
+ peaks.method="emauto"))
28
30
 
29
31
  #= Run it!
30
32
  if(length(opt$args)>1){
@@ -0,0 +1,152 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #
4
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
5
+ # @license Artistic-2.0
6
+ #
7
+
8
+ require 'optparse'
9
+
10
+ o = {q: false}
11
+ ARGV << '-h' if ARGV.size==0
12
+
13
+ OptionParser.new do |opt|
14
+ opt.banner = "
15
+ Extracts a list of sequences and/or coordinates from multi-FastA files.
16
+
17
+ Usage: #{$0} [options]"
18
+ opt.separator ''
19
+ opt.separator 'Mandatory'
20
+ opt.on('-i', '--in PATH', 'Input FastA file.'){ |v| o[:i] = v }
21
+ opt.on('-o', '--out PATH', 'Output FastA file.'){ |v| o[:o] = v }
22
+ opt.on('-c', '--coords STRING',
23
+ 'Comma-delimited list of coordinates (mandatory unless -C is passed).',
24
+ 'The format of the coordinates is "SEQ:FROM..TO" or "SEQ:FROM~LEN":',
25
+ 'SEQ: Sequence ID, or * (asterisk) to extract range from all sequences',
26
+ 'FROM: Integer, position of the first base to include (can be negative)',
27
+ 'TO: Integer, last base to include (can be negative)',
28
+ 'LEN: Length of the range to extract'
29
+ ){ |v| o[:c] = v }
30
+ opt.separator ''
31
+ opt.separator 'Options'
32
+ opt.on('-C', '--coords-file PATH',
33
+ 'File containing the coordinates, one per line.',
34
+ 'Each line must follow the format described for -c.'){ |v| o[:C] = v }
35
+ opt.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
36
+ opt.on('-h', '--help', 'Display this screen.') do
37
+ puts opt
38
+ exit
39
+ end
40
+ opt.separator ''
41
+ end.parse!
42
+ abort '-i is mandatory.' if o[:i].nil?
43
+ abort '-o is mandatory.' if o[:o].nil?
44
+ abort '-c is mandatory.' if o[:c].nil? and o[:C].nil?
45
+
46
+ # Classes to parse coordinates
47
# A single extraction request parsed from "SEQ:FROM..TO" or "SEQ:FROM~LEN".
#
# FROM and TO are 1-based, inclusive positions. Non-positive values are
# resolved from the end of the sequence (-1 is the last base). SEQ is a
# sequence ID, or "*" to apply the range to every sequence.
class SeqCoords
  attr_reader :id, :from, :to, :length, :str

  # @param str [String] coordinates in one of the formats described above
  # @raise [RuntimeError] if +str+ does not contain parseable coordinates
  def initialize(str)
    @str = str
    m = /(\S+):(-?\d+)(~|\.\.)(-?\d+)/.match(str)
    raise "Cannot parse coordinates: #{str}" if m.nil?

    @id = m[1]
    @from = m[2].to_i
    # "~" introduces a length, ".." introduces an end position
    if m[3] == '~'
      @length = m[4].to_i
    else
      @to = m[4].to_i
    end
  end

  # Extracts the requested range from +seq+.
  #
  # @param id [String] ID of the sequence being processed
  # @param seq [String] sequence to extract from
  # @return [String, nil] the extracted fragment, or nil when these
  #   coordinates do not concern +id+ or the range falls outside +seq+
  def extract(id, seq)
    return nil unless concerns? id

    first = absolute(from, seq)
    # A start outside the sequence must not wrap around via negative indices
    return nil unless first.between?(1, seq.length)

    # Positions are 1-based, Ruby string indices are 0-based
    if to.nil?
      seq[first - 1, length]
    else
      last = absolute(to, seq)
      return nil if last < first

      seq[(first - 1)..(last - 1)]
    end
  end

  # @param seq_id [String] sequence ID to test
  # @return [Boolean] whether these coordinates apply to +seq_id+
  def concerns?(seq_id)
    id == '*' || id == seq_id
  end

  private

  # Resolves a (possibly non-positive) position into a 1-based position
  # counted from the start of +seq+.
  def absolute(pos, seq)
    pos > 0 ? pos : seq.length + 1 + pos
  end
end
78
+
79
# An ordered list of SeqCoords that can be applied to a sequence at once.
class SeqCoordsCollection
  class << self
    # Builds a collection from a comma-delimited list of coordinates.
    #
    # @param str [String] e.g. "seq1:1..10,*:-5~5"
    # @return [SeqCoordsCollection]
    def from_str(str)
      c = new
      str.split(',').each { |i| c << SeqCoords.new(i) }
      c
    end

    # Builds a collection from a file with one set of coordinates per line.
    # Blank lines are ignored instead of being reported as malformed.
    #
    # @param path [String] path to the coordinates file
    # @return [SeqCoordsCollection]
    def from_file(path)
      c = new
      File.open(path, 'r') do |fh|
        fh.each do |ln|
          entry = ln.chomp
          next if entry.strip.empty?

          c << SeqCoords.new(entry)
        end
      end
      c
    end
  end

  attr_reader :collection

  def initialize
    @collection = []
  end

  # Appends one set of coordinates to the collection.
  def <<(coords)
    @collection << coords
  end

  # Applies every set of coordinates to the sequence.
  #
  # @param id [String] ID of the sequence
  # @param seq [String] the sequence itself
  # @return [Array] non-nil results of calling +extract+ on each member
  def extract(id, seq)
    @collection.map { |c| c.extract(id, seq) }.compact
  end
end
109
+
110
+ # Functions to parse sequences
111
# Writes to @ofh every fragment that @coll extracts from one input record.
#
# Relies on script-level state: @coll (responds to #extract), @ofh (output
# handle), and the @n_in / @n_out counters.
#
# @param id [String, nil] ID of the record; nil means no record was read yet
# @param sq [String] raw sequence text of the record, possibly multi-line
def do_stuff(id, sq)
  return if id.nil?

  # Strip line breaks and any other non-letter characters *before* deciding
  # whether the record is empty, so whitespace-only records are skipped.
  clean = sq.gsub(/[^A-Za-z]/, '')
  return if clean.empty?

  @n_in += 1
  @coll.extract(id, clean).each.with_index(1) do |new_sq, i|
    @ofh.puts ">#{id}:#{i}"
    @ofh.puts new_sq
    @n_out += 1
  end
end
122
+
123
# Parse coordinates
$stderr.puts 'Parsing coordinates' unless o[:q]
@coll = if o[:c].nil?
          SeqCoordsCollection.from_file(o[:C])
        else
          SeqCoordsCollection.from_str(o[:c])
        end
$stderr.puts " Coordinates found: #{@coll.collection.size}" unless o[:q]

# Parse sequences
$stderr.puts 'Parsing sequences' unless o[:q]
@n_in = 0
@n_out = 0
# Block form guarantees the output file is closed even if parsing fails
File.open(o[:o], 'w') do |ofh|
  @ofh = ofh
  File.open(o[:i], 'r') do |fh|
    id = nil
    sq = ''
    fh.each do |ln|
      next if ln =~ /^;/

      if ln =~ /^>(\S+)/
        next_id = $1
        # Flush the record read so far under its own ID before switching to
        # the record that this header starts.
        do_stuff(id, sq)
        id = next_id
        sq = ''
      else
        sq << ln
      end
    end
    # Flush the last record of the file
    do_stuff(id, sq)
  end
end
unless o[:q]
  $stderr.puts " Input sequences: #{@n_in}"
  $stderr.puts " Output fragments: #{@n_out}"
end
152
+
@@ -0,0 +1,89 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+ o = {x: 'N', trim: false, wrap: 70}
5
+ ARGV << '-h' if ARGV.empty?
6
+ OptionParser.new do |opts|
7
+ opts.banner = "
8
+ Mask sequence region(s) in a FastA file.
9
+
10
+ Usage: #{$0} [options]"
11
+ opts.separator ''
12
+ opts.separator 'Mandatory'
13
+ opts.on('-i', '--in FILE', 'Input FastA file.'){ |v| o[:in] = v }
14
+ opts.on('-o', '--out FILE', 'Output FastA file.'){ |v| o[:out] = v }
15
+ opts.on('-r', '--regions REG1,REG2,...', Array,
16
+ 'Regions to mask separated by commas.',
17
+ 'Each region must be in the format "sequence_id:from..to"'
18
+ ){ |v| o[:reg] = v }
19
+ opts.separator ''
20
+ opts.separator 'Options'
21
+ opts.on('-x', '--symbol CHAR',
22
+ 'Character used to mask the region(s)',
23
+ "By default: #{o[:x]}."){ |v| o[:x] = v }
24
+ opts.on('-t', '--trim',
25
+ 'Trim masked regions extending to the edge of a sequence'
26
+ ){ |v| o[:trim] = v }
27
+ opts.on('-w', '--wrap INT',
28
+ 'Line length to wrap sequences. Use 0 to generate 1-line sequences.',
29
+ "By default: #{o[:wrap]}."){ |v| o[:wrap] = v.to_i }
30
+ opts.on('-h', '--help', 'Display this screen.') do
31
+ puts opts
32
+ exit
33
+ end
34
+ opts.separator ''
35
+ end.parse!
36
+ abort '-i is mandatory' if o[:in].nil?
37
+ abort '-o is mandatory' if o[:out].nil?
38
+ abort '-r is mandatory' if o[:reg].nil?
39
+
40
# Wraps text into newline-terminated lines of at most +len+ characters.
#
# @param txt [String] text to wrap
# @param len [Integer] maximum line length; zero or a negative value
#   disables wrapping and yields a single line
# @return [String] wrapped text, or an empty string if +txt+ is empty
def wrap_width(txt, len)
  return '' if txt.empty?
  # A non-positive width cannot form a valid regex quantifier below
  return "#{txt}\n" unless len > 0

  txt.gsub(/.{1,#{len}}/) { |chunk| "#{chunk}\n" }
end
45
+
46
# Read input sequences
# sq maps sequence ID => [full definition line, sequence without whitespace]
sq = {}
File.open(o[:in], 'r') do |ifh|
  # Records are split on ">", so each chunk is "defline\nsequence...>"
  ifh.each('>') do |i|
    (dln, seq) = i.split(/[\n\r]+/, 2)
    next if seq.nil?

    id = dln.gsub(/\s.*/, '')
    seq.gsub!(/[\s>]/, '')
    sq[id] = [dln, seq]
  end
end

# Parse coordinates and mask regions
last_id = nil
o[:reg].each do |i|
  m = i.match(/^(?:(.+):)?(\d+)\.\.(\d+)$/)
  abort "Unexpected region format: #{i}" if m.nil?

  # Convert the 1-based, inclusive coordinates into 0-based string indices
  seq_id = m[1]
  first = m[2].to_i - 1
  last = m[3].to_i - 1
  # A region without sequence ID applies to the sequence of the previous one
  if seq_id.nil?
    abort "Region missing sequence ID: #{i}" if last_id.nil?
    seq_id = last_id
  end
  last_id = seq_id
  abort "Cannot find sequence #{seq_id}" unless sq[seq_id]
  abort "Malformed range: #{i}" unless first <= last
  # `last` is a 0-based index, so the largest valid value is size - 1
  if first < 0 || last >= sq[seq_id][1].size
    abort "Range extends beyond the edge of the sequence: #{i}"
  end
  sq[seq_id][1][first..last] = o[:x] * (1 + last - first)
end

# Trim sequences and generate output
# The mask symbol is user input: escape it so characters such as "*" or "."
# are matched literally in the trimming patterns
mask = Regexp.escape(o[:x])
File.open(o[:out], 'w') do |ofh|
  sq.each_value do |defline, seq|
    ofh.puts ">#{defline}"
    if o[:trim]
      seq = seq.sub(/\A(?:#{mask})+/, '').sub(/(?:#{mask})+\z/, '')
    end
    ofh.print wrap_width(seq, o[:wrap])
  end
end
89
+
@@ -0,0 +1,83 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #
4
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
5
+ # @license Artistic-2.0
6
+ #
7
+
8
+ require 'optparse'
9
+
10
# Command-line options; :f (fraction) and :n (number) are set only if passed
o = {q: false, rep: false}
ARGV << '-h' if ARGV.empty?

OptionParser.new do |opt|
  opt.banner = "
Samples a random set of sequences from a multi-FastA file.

Usage: #{$0} [options]"
  opt.separator ''
  opt.separator 'Mandatory'
  opt.on('-i', '--in PATH', 'Input FastA file.'){ |v| o[:i] = v }
  opt.on('-o', '--out PATH', 'Output FastA file.'){ |v| o[:o] = v }
  opt.on('-f', '--fraction FLOAT',
    'Fraction of sequences to sample [0-1].',
    'Mandatory unless -n is provided.'){ |v| o[:f] = v.to_f }
  opt.separator ''
  opt.separator 'Options'
  # The help text and the error message below call this option -n; -c is the
  # switch that was originally registered and is kept as an alias.
  opt.on('-n', '-c', '--number INT',
    'Number of sequences to sample.',
    'Mandatory unless -f is provided.'){ |v| o[:n] = v.to_i }
  opt.on('-r', '--replacement','Sample with replacement'){ |v| o[:rep] = v }
  opt.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
  opt.on('-h', '--help', 'Display this screen.') do
    puts opt
    exit
  end
  opt.separator ''
end.parse!
abort '-i is mandatory.' if o[:i].nil?
abort '-o is mandatory.' if o[:o].nil?
abort '-f or -n is mandatory.' if o[:f].nil? && o[:n].nil?
41
+
42
+ # Functions to parse sequences
43
+ def do_stuff(id, sq)
44
+ return if id.nil? or sq.empty?
45
+ @n_in += 1
46
+ sq.gsub!(/[^A-Za-z]/, '')
47
+ i = 0
48
+ @coll.extract(id, sq).each do |new_sq|
49
+ @ofh.puts ">#{id}:#{i += 1}"
50
+ @ofh.puts new_sq
51
+ @n_out += 1
52
+ end
53
+ end
54
+
55
+ # Parse sequences
56
+ $stderr.puts 'Parsing sequences' unless o[:q]
57
+ seq = []
58
+ File.open(o[:i], 'r') do |fh|
59
+ id = nil
60
+ sq = ''
61
+ fh.each do |ln|
62
+ next if ln =~ /^;/
63
+ if ln =~ /^>(.+)/
64
+ seq << [id, sq] unless id.nil?
65
+ id = $1
66
+ sq = ''
67
+ else
68
+ sq << ln
69
+ end
70
+ end
71
+ seq << [id, sq] unless id.nil?
72
+ end
73
+ $stderr.puts " Input sequences: #{seq.size}"
74
+ o[:n] ||= (seq.size * o[:f]).round
75
+ seq_o = o[:rep] ? o[:n].times.map{ seq.sample } : seq.sample(o[:n])
76
+ File.open(o[:o], 'w') do |fh|
77
+ seq_o.each do |i|
78
+ fh.puts ">#{i[0]}"
79
+ fh.puts i[1]
80
+ end
81
+ end
82
+ $stderr.puts " Output sequences: #{seq_o.size}"
83
+