miga-base 0.3.9.0 → 0.3.9.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +4 -4
  2. data/actions/add.rb +33 -33
  3. data/actions/edit.rb +33 -0
  4. data/actions/new.rb +17 -18
  5. data/actions/next_step.rb +33 -0
  6. data/actions/run.rb +15 -12
  7. data/bin/miga +43 -37
  8. data/lib/miga/daemon.rb +2 -2
  9. data/lib/miga/project/result.rb +16 -1
  10. data/lib/miga/version.rb +2 -2
  11. data/scripts/aai_distances.bash +1 -3
  12. data/scripts/ani_distances.bash +1 -3
  13. data/scripts/assembly.bash +1 -3
  14. data/scripts/cds.bash +1 -3
  15. data/scripts/clade_finding.bash +1 -3
  16. data/scripts/d.bash +13 -0
  17. data/scripts/distances.bash +1 -3
  18. data/scripts/essential_genes.bash +1 -3
  19. data/scripts/haai_distances.bash +1 -3
  20. data/scripts/miga.bash +12 -9
  21. data/scripts/mytaxa.bash +1 -3
  22. data/scripts/mytaxa_scan.bash +1 -3
  23. data/scripts/ogs.bash +36 -33
  24. data/scripts/p.bash +23 -0
  25. data/scripts/project_stats.bash +1 -3
  26. data/scripts/read_quality.bash +1 -3
  27. data/scripts/ssu.bash +1 -3
  28. data/scripts/stats.bash +1 -3
  29. data/scripts/subclades.bash +1 -3
  30. data/scripts/taxonomy.bash +1 -3
  31. data/scripts/trimmed_fasta.bash +1 -3
  32. data/scripts/trimmed_reads.bash +1 -3
  33. data/test/daemon_test.rb +3 -3
  34. data/utils/distance/runner.rb +1 -1
  35. data/utils/enveomics/Docs/recplot2.md +13 -2
  36. data/utils/enveomics/Examples/aai-matrix.bash +3 -3
  37. data/utils/enveomics/Examples/ani-matrix.bash +3 -3
  38. data/utils/enveomics/Makefile +2 -2
  39. data/utils/enveomics/Manifest/Tasks/blasttab.json +12 -4
  40. data/utils/enveomics/Manifest/Tasks/fasta.json +135 -0
  41. data/utils/enveomics/Manifest/Tasks/other.json +49 -0
  42. data/utils/enveomics/Manifest/categories.json +4 -0
  43. data/utils/enveomics/Manifest/examples.json +1 -1
  44. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +1 -1
  45. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +1 -1
  46. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +1 -1
  47. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +1 -1
  48. data/utils/enveomics/Scripts/BlastTab.catsbj.pl +63 -65
  49. data/utils/enveomics/Scripts/BlastTab.recplot2.R +4 -2
  50. data/utils/enveomics/Scripts/FastA.extract.rb +152 -0
  51. data/utils/enveomics/Scripts/FastA.mask.rb +89 -0
  52. data/utils/enveomics/Scripts/FastA.sample.rb +83 -0
  53. data/utils/enveomics/Scripts/GFF.catsbj.pl +127 -0
  54. data/utils/enveomics/Scripts/aai.rb +4 -3
  55. data/utils/enveomics/Scripts/lib/enveomics.R +1 -1
  56. data/utils/enveomics/enveomics.R/DESCRIPTION +1 -2
  57. data/utils/enveomics/enveomics.R/NAMESPACE +3 -3
  58. data/utils/enveomics/enveomics.R/R/recplot.R +2 -3
  59. data/utils/enveomics/enveomics.R/R/recplot2.R +221 -160
  60. data/utils/enveomics/enveomics.R/R/utils.R +19 -1
  61. data/utils/enveomics/enveomics.R/README.md +11 -0
  62. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +2 -2
  63. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +1 -0
  64. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -5
  65. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +10 -8
  66. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +1 -1
  67. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +26 -0
  68. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +22 -0
  69. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +13 -7
  70. data/utils/enveomics/enveomics.R/man/z$-methods.Rd +3 -4
  71. data/utils/subclade/runner.rb +4 -0
  72. metadata +14 -3
@@ -41,14 +41,17 @@
41
41
  "FastQ.test-error.rb"
42
42
  ],
43
43
  "Manipulation": [
44
+ "FastA.extract.rb",
44
45
  "FastA.filter.pl",
45
46
  "FastA.filterLen.pl",
46
47
  "FastA.filterN.pl",
47
48
  "FastA.fragment.rb",
48
49
  "FastA.interpose.pl",
50
+ "FastA.mask.rb",
49
51
  "FastA.per_file.pl",
50
52
  "FastA.rename.pl",
51
53
  "FastA.revcom.pl",
54
+ "FastA.sample.rb",
52
55
  "FastA.slider.pl",
53
56
  "FastA.split.pl",
54
57
  "FastA.split.rb",
@@ -143,6 +146,7 @@
143
146
  "BlastTab.catsbj.pl",
144
147
  "BlastTab.pairedHits.rb",
145
148
  "BlastTab.recplot2.R",
149
+ "GFF.catsbj.pl",
146
150
  "RecPlot2.compareIdentities.R"
147
151
  ]
148
152
  }
@@ -57,7 +57,7 @@
57
57
  "description": ["Generates recruitment plots for a comparison",
58
58
  "between a virome containing HIV and the HIV-1 genome."],
59
59
  "values": ["hiv_mix-hiv1.blast.tsv",50,100,null,null,null,null,null,null,
60
- null,null,"hiv_mix-hiv1.Rdata","hiv_mix-hiv1.pdf",null,null]
60
+ null,null,null,"hiv_mix-hiv1.Rdata","hiv_mix-hiv1.pdf",null,null]
61
61
  },
62
62
  {
63
63
  "_": "== Examples of functional annotations ==",
@@ -1 +1 @@
1
- ../../Scripts/FastA.N50.pl
1
+ utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.N50.pl
@@ -1 +1 @@
1
- ../../Scripts/FastA.filterN.pl
1
+ utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.filterN.pl
@@ -1 +1 @@
1
- ../../Scripts/FastA.length.pl
1
+ utils/enveomics/Pipelines/assembly.pbs/../../Scripts/FastA.length.pl
@@ -1 +1 @@
1
- ../../Scripts/FastA.split.pl
1
+ utils/enveomics/Pipelines/blast.pbs/../../Scripts/FastA.split.pl
@@ -1,9 +1,7 @@
1
1
  #!/usr/bin/env perl
2
- #
3
- # @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
- # @updated: Mar-23-2015
5
- # @license: artistic license 2.0
6
- #
2
+
3
+ # @author: Luis M. Rodriguez-R
4
+ # @license: Artistic-2.0
7
5
 
8
6
  use warnings;
9
7
  use strict;
@@ -13,29 +11,29 @@ use Getopt::Std;
13
11
  sub HELP_MESSAGE { die "
14
12
 
15
13
  Description:
16
- Generates a list of hits from a BLAST result concatenating the subject
17
- sequences. This can be used, e.g., to analyze BLAST results against
18
- draft genomes.
14
+ Generates a list of hits from a BLAST result concatenating the subject
15
+ sequences. This can be used, e.g., to analyze BLAST results against
16
+ draft genomes.
19
17
 
20
18
  Usage:
21
- $0 [options] seq.fa map.bls
19
+ $0 [options] seq.fa map.bls
22
20
 
23
- seq.fa Subject sequences (ref) in FastA format.
24
- map.bls Mapping of the reads to the reference in BLAST Tabular
21
+ seq.fa Subject sequences (ref) in FastA format.
22
+ map.bls Mapping of the reads to the reference in BLAST Tabular
25
23
  format.
26
24
 
27
- Options:
28
- -i <float> Minimum identity to report a result. By default: 70.
29
- -l <int> Minimum alignment length to report a result. By default: 60.
30
- -s The FastA provided is to be treated as a subset of the subject.
31
- By default, it expects all the subjects to be present in the
32
- BLAST.
33
- -q Run quietly.
34
- -h Display this message and exit.
25
+ Options:
26
+ -i <float> Minimum identity to report a result. By default: 70.
27
+ -l <int> Minimum alignment length to report a result. By default: 60.
28
+ -s The FastA provided is to be treated as a subset of the subject.
29
+ By default, it expects all the BLAST subjects to be present in
30
+ the FastA.
31
+ -q Run quietly.
32
+ -h Display this message and exit.
35
33
 
36
- This script creates two files using <map.bls> as prefix with extensions
37
- .rec (for the recruitment plot) and .lim (for the limits of the different
38
- sequences in <seq.fa>).
34
+ This script creates two files using <map.bls> as prefix with extensions
35
+ .rec (for the recruitment plot) and .lim (for the limits of the different
36
+ sequences in <seq.fa>).
39
37
 
40
38
  ";}
41
39
 
@@ -51,56 +49,56 @@ my %seq = ();
51
49
  my @seq = ();
52
50
  my $tot = 0;
53
51
 
54
- SEQ:{
55
- print STDERR "== Reading reference sequences\n" unless $o{q};
56
- open FA, "<", $fa or die "Cannot read the file: $fa: $!\n";
57
- my $cur_seq = '';
58
- while(<FA>){
59
- chomp;
60
- if(m/^>(\S+)/){
61
- my $c = $1;
62
- $seq{$c} = exists $seq{$cur_seq} ? $seq{$cur_seq}+1 : 1;
63
- push @seq, $c;
64
- $cur_seq = $c;
65
- }else{
66
- s/[^A-Za-z]//g;
67
- $seq{$cur_seq} += length $_;
68
- }
69
- }
70
- close FA;
71
- print STDERR " Found ".(scalar @seq)." sequences.\n" unless $o{q};
52
+ SEQ: {
53
+ print STDERR "== Reading reference sequences\n" unless $o{q};
54
+ open FA, "<", $fa or die "Cannot read the file: $fa: $!\n";
55
+ my $cur_seq = '';
56
+ while(<FA>){
57
+ chomp;
58
+ if(m/^>(\S+)/){
59
+ my $c = $1;
60
+ $seq{$c} = exists $seq{$cur_seq} ? $seq{$cur_seq}+1 : 1;
61
+ push @seq, $c;
62
+ $cur_seq = $c;
63
+ }else{
64
+ s/[^A-Za-z]//g;
65
+ $seq{$cur_seq} += length $_;
66
+ }
67
+ }
68
+ close FA;
69
+ print STDERR " Found ".(scalar @seq)." sequences.\n" unless $o{q};
72
70
  }
73
71
 
74
72
  open LIM, ">", "$map.lim" or die "Cannot create the file: $map.lim: $!\n";
75
73
  my $l = 0;
76
74
  for my $s (@seq){
77
- print LIM "$s\t".(++$l)."\t$seq{$s}\n";
78
- ($l, $seq{$s}) = ($seq{$s}, $l);
75
+ print LIM "$s\t".(++$l)."\t$seq{$s}\n";
76
+ ($l, $seq{$s}) = ($seq{$s}, $l);
79
77
  }
80
78
  close LIM;
81
79
 
82
- MAP:{
83
- print STDERR "== Reading mapping\n" unless $o{q};
84
- open BLS, "<", $map or die "Cannot read the file: $map: $!\n";
85
- open REC, ">", "$map.rec" or die "Cannot create the file: $map.rec: $!\n";
86
- RESULT:while(<BLS>){
87
- chomp;
88
- my @ln = split /\t/;
89
- $ln[11] or die "Cannot parse line $map:$.: $_\n";
90
- next unless $ln[3]>=$o{l};
91
- next unless $ln[2]>=$o{i};
92
- unless(exists $seq{$ln[1]}){
93
- die "Cannot find the subject sequence: $ln[1]\n" unless $o{s};
94
- next RESULT;
95
- }
96
- my $start = $seq{$ln[1]}+min($ln[8], $ln[9]);
97
- my $end = $seq{$ln[1]}+max($ln[8], $ln[9]);
98
- print REC "$start\t$end\t$ln[2]\t$ln[11]\t$ln[0]",
99
- (exists($ln[13])?"\t".($ln[2]*$ln[3]/min($ln[12],$ln[13]))."\t":
100
- exists($ln[12])?"\t".($ln[2]*$ln[3]/$ln[12])."\t":""),"\n";
101
- }
102
- close BLS;
103
- close REC;
104
- print STDERR " done.\n" unless $o{q};
80
+ MAP: {
81
+ print STDERR "== Reading mapping\n" unless $o{q};
82
+ open BLS, "<", $map or die "Cannot read the file: $map: $!\n";
83
+ open REC, ">", "$map.rec" or die "Cannot create the file: $map.rec: $!\n";
84
+ RESULT: while(<BLS>){
85
+ chomp;
86
+ my @ln = split /\t/;
87
+ $ln[11] or die "Cannot parse line $map:$.: $_\n";
88
+ next unless $ln[3]>=$o{l};
89
+ next unless $ln[2]>=$o{i};
90
+ unless(exists $seq{$ln[1]}){
91
+ die "Cannot find the subject sequence: $ln[1]\n" unless $o{s};
92
+ next RESULT;
93
+ }
94
+ my $start = $seq{$ln[1]}+min($ln[8], $ln[9]);
95
+ my $end = $seq{$ln[1]}+max($ln[8], $ln[9]);
96
+ print REC "$start\t$end\t$ln[2]\t$ln[11]\t$ln[0]",
97
+ (exists($ln[13])?"\t".($ln[2]*$ln[3]/min($ln[12],$ln[13]))."\t":
98
+ exists($ln[12])?"\t".($ln[2]*$ln[3]/$ln[12])."\t":""),"\n";
99
+ }
100
+ close BLS;
101
+ close REC;
102
+ print STDERR " done.\n" unless $o{q};
105
103
  }
106
104
 
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env Rscript
2
2
 
3
3
  # @author Luis M. Rodriguez-R
4
- # @license artistic license 2.0
4
+ # @license Artistic-2.0
5
5
 
6
6
  #= Load stuff
7
7
  suppressPackageStartupMessages(library(enveomics.R))
@@ -17,6 +17,7 @@ opt <- enve.cliopts(enve.recplot2,
17
17
  usage="usage: %prog [options] output.Rdata [output.pdf [width height]]",
18
18
  mandatory=c("prefix"),
19
19
  o_desc=list(pos.breaks="Breaks in the positions histogram.",
20
+ pos.breaks.tsv="File with (absolute) coordinates of breaks in the position histogram",
20
21
  id.breaks="Breaks in the identity histogram.",
21
22
  id.summary="Function summarizing the identity bins. By default: sum.",
22
23
  peaks.col="Color of peaks, mandatory for peak-finding (e.g., darkred).",
@@ -24,7 +25,8 @@ opt <- enve.cliopts(enve.recplot2,
24
25
  p_desc=paste("","Produce recruitment plot objects provided that",
25
26
  "BlastTab.catsbj.pl has been previously executed.", sep="\n\t"),
26
27
  ignore=c("plot"),
27
- defaults=c(id.metric="identity", peaks.col=NA, peaks.method="emauto"))
28
+ defaults=c(pos.breaks.tsv=NA, id.metric="identity", peaks.col=NA,
29
+ peaks.method="emauto"))
28
30
 
29
31
  #= Run it!
30
32
  if(length(opt$args)>1){
@@ -0,0 +1,152 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #
4
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
5
+ # @license Artistic-2.0
6
+ #
7
+
8
+ require 'optparse'
9
+
# Command-line options collected by the parser below (-q defaults to off).
o = { q: false }
# Without any arguments, behave as if -h had been passed.
ARGV << '-h' if ARGV.empty?

OptionParser.new do |opt|
  opt.banner = "
Extracts a list of sequences and/or coordinates from multi-FastA files.

Usage: #{$0} [options]"
  opt.separator ''
  opt.separator 'Mandatory'
  opt.on('-i', '--in PATH', 'Input FastA file.') { |v| o[:i] = v }
  opt.on('-o', '--out PATH', 'Output FastA file.') { |v| o[:o] = v }
  opt.on('-c', '--coords STRING',
    'Comma-delimited list of coordinates (mandatory unless -C is passed).',
    'The format of the coordinates is "SEQ:FROM..TO" or "SEQ:FROM~LEN":',
    'SEQ: Sequence ID, or * (asterisk) to extract range from all sequences',
    'FROM: Integer, position of the first base to include (can be negative)',
    'TO: Integer, last base to include (can be negative)',
    'LEN: Length of the range to extract'
  ) { |v| o[:c] = v }
  opt.separator ''
  opt.separator 'Options'
  opt.on('-C', '--coords-file PATH',
    'File containing the coordinates, one per line.',
    'Each line must follow the format described for -c.') { |v| o[:C] = v }
  opt.on('-q', '--quiet', 'Run quietly (no STDERR output).') { o[:q] = true }
  opt.on('-h', '--help', 'Display this screen.') do
    puts opt
    exit
  end
  opt.separator ''
end.parse!
abort '-i is mandatory.' if o[:i].nil?
abort '-o is mandatory.' if o[:o].nil?
# Coordinates may come from -c or from -C; fail only when both are missing,
# and say so in the message.
abort '-c or -C is mandatory.' if o[:c].nil? && o[:C].nil?
45
+
46
+ # Classes to parse coordinates
##
# One coordinate specification, written as "SEQ:FROM..TO" or "SEQ:FROM~LEN".
#
# SEQ is a sequence ID, or "*" to apply the range to every sequence. FROM and
# TO are 1-based positions; negative values count from the end of the
# sequence (-1 is the last base). LEN is the number of bases to take.
class SeqCoords
  attr_reader :id, :from, :to, :length, :str

  ##
  # Parses +str+. Raises a RuntimeError when it does not contain a
  # recognizable coordinate specification.
  def initialize(str)
    @str = str
    m = /(\S+):(-?\d+)(~|\.\.)(-?\d+)/.match(str)
    raise "Cannot parse coordinates: #{str}" if m.nil?
    @id = m[1]
    @from = m[2].to_i
    # "~" introduces a length, ".." an end position; only one of the two
    # attributes is set, the other stays nil.
    if m[3] == '~'
      @length = m[4].to_i
    else
      @to = m[4].to_i
    end
  end

  ##
  # Returns the requested region of +seq+ when these coordinates apply to the
  # sequence called +id+. Returns nil when they do not apply, or when the
  # start (or a negative end) falls outside of the sequence.
  def extract(id, seq)
    return nil unless concerns? id
    first = index_of(from, seq.length)
    # Ruby would silently wrap a negative index around the string, so
    # out-of-range starts are rejected explicitly.
    return nil if first < 0 || first >= seq.length
    return seq[first, length] if to.nil?
    last = index_of(to, seq.length)
    return nil if last < 0
    seq[first..last]
  end

  ##
  # Do these coordinates apply to the sequence called +seq_id+?
  def concerns?(seq_id)
    id == '*' || id == seq_id
  end

  private

  # Converts the 1-based (or negative, counted from the end) position +pos+
  # into a 0-based string index for a sequence of +len+ bases.
  def index_of(pos, len)
    pos > 0 ? pos - 1 : len + pos
  end
end
78
+
##
# An ordered list of SeqCoords that can be applied together to a sequence.
class SeqCoordsCollection
  class << self
    ##
    # Builds a collection from a comma-delimited list of coordinates.
    def from_str(str)
      c = new
      str.split(',').each { |i| c << SeqCoords.new(i) }
      c
    end

    ##
    # Builds a collection from the file at +path+, one coordinate per line.
    # Blank lines (e.g., a trailing empty line) are ignored instead of being
    # reported as unparseable coordinates.
    def from_file(path)
      c = new
      File.foreach(path) do |ln|
        ln = ln.chomp
        next if ln.strip.empty?
        c << SeqCoords.new(ln)
      end
      c
    end
  end

  attr_reader :collection

  def initialize
    @collection = []
  end

  ##
  # Appends +coords+ to the collection.
  def <<(coords)
    @collection << coords
  end

  ##
  # Applies every coordinate to the sequence +seq+ called +id+ and returns
  # the non-nil results, in the order the coordinates were added.
  def extract(id, seq)
    @collection.map { |c| c.extract(id, seq) }.compact
  end
end
109
+
# Functions to parse sequences

##
# Writes to @ofh every region of the sequence +sq+ (called +id+) selected by
# the coordinates in @coll, as FastA records named "ID:N" with N counting from
# 1 within each sequence. Updates the @n_in and @n_out counters.
#
# Does nothing when +id+ is nil or +sq+ is empty. Expects @coll, @ofh, @n_in
# and @n_out to be set by the caller before the first call.
def do_stuff(id, sq)
  return if id.nil? || sq.empty?
  @n_in += 1
  # Work on a cleaned copy (letters only) so the caller's buffer is untouched.
  clean = sq.gsub(/[^A-Za-z]/, '')
  i = 0
  @coll.extract(id, clean).each do |new_sq|
    # An empty fragment would produce a header-only record; skip it.
    next if new_sq.nil? || new_sq.empty?
    @ofh.puts ">#{id}:#{i += 1}"
    @ofh.puts new_sq
    @n_out += 1
  end
end
122
+
# Parse coordinates
$stderr.puts 'Parsing coordinates' unless o[:q]
@coll = o[:c].nil? ? SeqCoordsCollection.from_file(o[:C]) :
  SeqCoordsCollection.from_str(o[:c])
$stderr.puts " Coordinates found: #{@coll.collection.size}" unless o[:q]

# Parse sequences
$stderr.puts 'Parsing sequences' unless o[:q]
@n_in = 0
@n_out = 0
# Block form closes the output even if parsing raises.
File.open(o[:o], 'w') do |ofh|
  @ofh = ofh
  File.open(o[:i], 'r') do |fh|
    id = nil
    sq = ''
    fh.each do |ln|
      next if ln =~ /^;/
      if ln =~ /^>(\S+)/
        next_id = $1
        # Flush the record accumulated so far under ITS OWN ID before moving
        # on to the one that starts at this header.
        do_stuff(id, sq)
        id = next_id
        sq = ''
      else
        sq << ln
      end
    end
    # Flush the last record of the file.
    do_stuff(id, sq)
  end
end
$stderr.puts " Input sequences: #{@n_in}" unless o[:q]
$stderr.puts " Output fragments: #{@n_out}" unless o[:q]
152
+
@@ -0,0 +1,89 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'optparse'
# Defaults: mask with 'N', keep masked edges, wrap output at 70 columns.
o = { x: 'N', trim: false, wrap: 70 }
# Without any arguments, behave as if -h had been passed.
ARGV << '-h' if ARGV.empty?
OptionParser.new do |opts|
  opts.banner = "
Mask sequence region(s) in a FastA file.

Usage: #{$0} [options]"
  opts.separator ''
  opts.separator 'Mandatory'
  opts.on('-i', '--in FILE', 'Input FastA file.') { |v| o[:in] = v }
  opts.on('-o', '--out FILE', 'Output FastA file.') { |v| o[:out] = v }
  opts.on('-r', '--regions REG1,REG2,...', Array,
    'Regions to mask separated by commas.',
    'Each region must be in the format "sequence_id:from..to"'
  ) { |v| o[:reg] = v }
  opts.separator ''
  opts.separator 'Options'
  opts.on('-x', '--symbol CHAR',
    'Character used to mask the region(s)',
    "By default: #{o[:x]}.") { |v| o[:x] = v }
  opts.on('-t', '--trim',
    'Trim masked regions extending to the edge of a sequence'
  ) { |v| o[:trim] = v }
  opts.on('-w', '--wrap INT',
    'Line length to wrap sequences. Use 0 to generate 1-line sequences.',
    "By default: #{o[:wrap]}.") { |v| o[:wrap] = v.to_i }
  opts.on('-h', '--help', 'Display this screen.') do
    puts opts
    exit
  end
  opts.separator ''
end.parse!
abort '-i is mandatory' if o[:in].nil?
abort '-o is mandatory' if o[:out].nil?
abort '-r is mandatory' if o[:reg].nil?
# The mask is built by repeating the symbol once per masked position, so
# anything other than exactly one character would change sequence lengths.
abort '-x must be a single character' unless o[:x].size == 1
abort '-w cannot be negative' if o[:wrap] < 0
39
+
##
# Returns +txt+ broken into lines of at most +len+ characters, each one
# terminated by a newline. An empty +txt+ yields an empty string. A +len+ of
# zero (or a negative value, which cannot be a line width) yields +txt+ on a
# single line.
def wrap_width(txt, len)
  return '' if txt.empty?
  return "#{txt}\n" if len <= 0
  txt.gsub(/(.{1,#{len}})/, "\\1\n")
end
45
+
# Read input sequences
# sq maps each sequence ID to [definition line, sequence without whitespace].
sq = {}
File.open(o[:in], 'r') do |ifh|
  # Records are read using '>' as the separator, so each chunk holds one
  # definition line followed by its sequence (and the next record's '>').
  ifh.each('>') do |i|
    (dln, seq) = i.split(/[\n\r]+/, 2)
    next if seq.nil?
    id = dln.gsub(/\s.*/, '')
    seq.gsub!(/[\s>]/, '')
    sq[id] = [dln, seq]
  end
end

# Parse coordinates and mask regions
last_id = nil
o[:reg].each do |i|
  m = i.match(/^(?:(.+):)?(\d+)\.\.(\d+)$/)
  abort "Unexpected region format: #{i}" if m.nil?
  # Regions are given 1-based and inclusive; convert to 0-based indices.
  r = [m[1], m[2].to_i - 1, m[3].to_i - 1]
  # A region without a sequence ID reuses the ID of the previous region.
  if r[0].nil?
    abort "Region missing sequence ID: #{i}" if last_id.nil?
    r[0] = last_id
  end
  last_id = r[0]
  sq[r[0]] or abort "Cannot find sequence #{r[0]}"
  r[1] <= r[2] or abort "Malformed range: #{i}"
  # r[2] is a 0-based index, so the last valid value is size - 1.
  if r[1] < 0 || r[2] >= sq[r[0]][1].size
    abort "Range extends beyond the edge of the sequence: #{i}"
  end
  sq[r[0]][1][r[1]..r[2]] = o[:x] * (1 + r[2] - r[1])
end

# Trim sequences and generate output
# The symbol is escaped because it is user input and may be a regex
# metacharacter (e.g., '.' or '*').
mask = Regexp.escape(o[:x])
File.open(o[:out], 'w') do |ofh|
  sq.each do |_k, v|
    ofh.puts ">#{v[0]}"
    if o[:trim]
      v[1].gsub!(/\A(?:#{mask})+/, '')
      v[1].gsub!(/(?:#{mask})+\z/, '')
    end
    ofh.print wrap_width(v[1], o[:wrap])
  end
end
89
+
@@ -0,0 +1,83 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #
4
+ # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
5
+ # @license Artistic-2.0
6
+ #
7
+
8
+ require 'optparse'
9
+
# Defaults: verbose, sampling without replacement.
o = { q: false, rep: false }
# Without any arguments, behave as if -h had been passed.
ARGV << '-h' if ARGV.empty?

OptionParser.new do |opt|
  opt.banner = "
Samples a random set of sequences from a multi-FastA file.

Usage: #{$0} [options]"
  opt.separator ''
  opt.separator 'Mandatory'
  opt.on('-i', '--in PATH', 'Input FastA file.') { |v| o[:i] = v }
  opt.on('-o', '--out PATH', 'Output FastA file.') { |v| o[:o] = v }
  opt.on('-f', '--fraction FLOAT',
    'Fraction of sequences to sample [0-1].',
    'Mandatory unless -n is provided.') { |v| o[:f] = v.to_f }
  opt.separator ''
  opt.separator 'Options'
  # The help text and the error message below call this switch -n; -c is
  # kept as an alias so existing invocations continue to work.
  opt.on('-n', '-c', '--number INT',
    'Number of sequences to sample.',
    'Mandatory unless -f is provided.') { |v| o[:n] = v.to_i }
  opt.on('-r', '--replacement', 'Sample with replacement') { |v| o[:rep] = v }
  opt.on('-q', '--quiet', 'Run quietly (no STDERR output).') { o[:q] = true }
  opt.on('-h', '--help', 'Display this screen.') do
    puts opt
    exit
  end
  opt.separator ''
end.parse!
abort '-i is mandatory.' if o[:i].nil?
abort '-o is mandatory.' if o[:o].nil?
abort '-f or -n is mandatory.' if o[:f].nil? && o[:n].nil?
41
+
# Functions to parse sequences

##
# Writes to @ofh every region of the sequence +sq+ (called +id+) returned by
# @coll.extract, as FastA records named "ID:N" with N counting from 1 within
# each sequence. Updates the @n_in and @n_out counters.
#
# Does nothing when +id+ is nil or +sq+ is empty.
#
# NOTE(review): nothing in this sampling script calls this method or sets
# @coll, @ofh, @n_in or @n_out; it looks like a leftover from
# FastA.extract.rb. Callers must set that state first. Confirm whether it can
# be removed.
def do_stuff(id, sq)
  return if id.nil? || sq.empty?
  @n_in += 1
  # Work on a cleaned copy (letters only) so the caller's buffer is untouched.
  clean = sq.gsub(/[^A-Za-z]/, '')
  i = 0
  @coll.extract(id, clean).each do |new_sq|
    # An empty fragment would produce a header-only record; skip it.
    next if new_sq.nil? || new_sq.empty?
    @ofh.puts ">#{id}:#{i += 1}"
    @ofh.puts new_sq
    @n_out += 1
  end
end
54
+
# Parse sequences
$stderr.puts 'Parsing sequences' unless o[:q]
# seq holds one [definition line, raw sequence lines] pair per input record.
seq = []
File.open(o[:i], 'r') do |fh|
  id = nil
  sq = ''
  fh.each do |ln|
    next if ln =~ /^;/
    if ln =~ /^>(.+)/
      # A new header closes the record read so far (if any).
      seq << [id, sq] unless id.nil?
      id = $1
      sq = ''
    else
      sq << ln
    end
  end
  seq << [id, sq] unless id.nil?
end
$stderr.puts " Input sequences: #{seq.size}" unless o[:q]

# Sample
# When only a fraction was given, derive the number of sequences from it.
o[:n] ||= (seq.size * o[:f]).round
# A negative count cannot be sampled; treat it as zero.
n = [o[:n], 0].max
seq_o =
  if o[:rep]
    # Nothing can be drawn from an empty input, with or without replacement.
    seq.empty? ? [] : Array.new(n) { seq.sample }
  else
    seq.sample(n)
  end

# Write output
File.open(o[:o], 'w') do |fh|
  seq_o.each do |i|
    fh.puts ">#{i[0]}"
    fh.puts i[1]
  end
end
$stderr.puts " Output sequences: #{seq_o.size}" unless o[:q]
83
+