miga-base 0.7.26.2 → 1.0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. checksums.yaml +4 -4
  2. data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
  3. data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
  4. data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
  5. data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
  6. data/lib/miga/cli/action/classify_wf.rb +2 -2
  7. data/lib/miga/cli/action/derep_wf.rb +1 -1
  8. data/lib/miga/cli/action/doctor.rb +57 -14
  9. data/lib/miga/cli/action/doctor/base.rb +47 -23
  10. data/lib/miga/cli/action/env.rb +26 -0
  11. data/lib/miga/cli/action/init.rb +11 -7
  12. data/lib/miga/cli/action/init/files_helper.rb +1 -0
  13. data/lib/miga/cli/action/ncbi_get.rb +3 -3
  14. data/lib/miga/cli/action/tax_dist.rb +2 -2
  15. data/lib/miga/cli/action/wf.rb +5 -4
  16. data/lib/miga/cli/base.rb +1 -0
  17. data/lib/miga/common.rb +1 -0
  18. data/lib/miga/daemon.rb +11 -4
  19. data/lib/miga/dataset/result.rb +10 -6
  20. data/lib/miga/json.rb +5 -4
  21. data/lib/miga/metadata.rb +5 -1
  22. data/lib/miga/parallel.rb +36 -0
  23. data/lib/miga/project.rb +8 -8
  24. data/lib/miga/project/base.rb +4 -4
  25. data/lib/miga/project/result.rb +2 -2
  26. data/lib/miga/sqlite.rb +10 -2
  27. data/lib/miga/version.rb +23 -9
  28. data/scripts/aai_distances.bash +16 -18
  29. data/scripts/ani_distances.bash +16 -17
  30. data/scripts/assembly.bash +31 -16
  31. data/scripts/haai_distances.bash +3 -27
  32. data/scripts/miga.bash +12 -8
  33. data/scripts/p.bash +1 -1
  34. data/scripts/read_quality.bash +9 -18
  35. data/scripts/trimmed_fasta.bash +14 -30
  36. data/scripts/trimmed_reads.bash +36 -36
  37. data/test/parallel_test.rb +31 -0
  38. data/test/project_test.rb +2 -1
  39. data/test/remote_dataset_test.rb +1 -1
  40. data/utils/distance/commands.rb +1 -0
  41. data/utils/distance/database.rb +0 -1
  42. data/utils/distance/runner.rb +2 -4
  43. data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
  44. data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
  45. data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
  46. data/utils/enveomics/Manifest/Tasks/other.json +77 -0
  47. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
  48. data/utils/enveomics/Manifest/categories.json +13 -4
  49. data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
  50. data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
  51. data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
  52. data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
  53. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  54. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  55. data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
  56. data/utils/enveomics/Scripts/SRA.download.bash +6 -8
  57. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  58. data/utils/enveomics/Scripts/aai.rb +3 -2
  59. data/utils/enveomics/Scripts/anir.rb +137 -0
  60. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  61. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  62. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
  63. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  64. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  65. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
  66. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  67. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  68. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  69. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  70. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
  71. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  72. data/utils/enveomics/Scripts/rbm.rb +87 -133
  73. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  74. data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
  75. data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
  76. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  77. data/utils/enveomics/enveomics.R/R/utils.R +30 -0
  78. data/utils/enveomics/enveomics.R/README.md +1 -0
  79. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
  80. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
  81. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
  82. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
  83. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
  84. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
  85. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
  86. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
  87. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
  88. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
  89. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  90. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
  91. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
  92. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
  93. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
  94. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
  95. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
  96. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
  97. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
  98. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
  99. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
  100. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  101. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
  102. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
  103. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
  104. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
  105. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
  106. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  107. data/utils/multitrim/README.md +67 -0
  108. data/utils/multitrim/multitrim.py +1555 -0
  109. data/utils/multitrim/multitrim.yml +13 -0
  110. data/utils/requirements.txt +4 -3
  111. data/utils/subclade/pipeline.rb +2 -2
  112. metadata +33 -4
  113. data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
@@ -1,9 +1,8 @@
1
1
  #!/usr/bin/env perl
2
- #
2
+
3
3
  # @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
4
- # @update: Oct 07 2015
5
- # @license: artistic license 2.0
6
- #
4
+ # @license: Artistic-2.0
5
+
7
6
  use strict;
8
7
  use warnings;
9
8
  use List::Util qw/sum min max/;
@@ -11,46 +10,51 @@ use List::Util qw/sum min max/;
11
10
  my ($seqs, $minlen, $n__) = @ARGV;
12
11
  $seqs or die "
13
12
  Description:
14
- Calculates the N50 value of a set of sequences. Alternatively, it
15
- can calculate other N** values. It also calculates the total number
16
- of sequences and the total added length.
17
-
13
+ Calculates the N50 value of a set of sequences. Alternatively, it
14
+ can calculate other N** values. It also calculates the total number
15
+ of sequences, the total added length, and the longest sequence length.
16
+
18
17
  Usage:
19
- $0 seqs.fa[ minlen[ **]]
18
+ $0 seqs.fa [minlen [**]]
19
+
20
+ seqs.fa A FastA file containing the sequences
21
+ minlen (optional) The minimum length to take into consideration
22
+ By default: 0
23
+ ** (optional) Value N** to calculate. By default: 50 (N50)
20
24
 
21
- seqs.fa A FastA file containing the sequences.
22
- minlen (optional) The minimum length to take into consideration.
23
- By default: 0.
24
- ** Value N** to calculate. By default: 50 (N50).
25
25
  ";
26
+
26
27
  $minlen ||= 0;
27
28
  $n__ ||= 50;
28
29
 
29
30
  my @len = ();
30
31
  open SEQ, "<", $seqs or die "Cannot open file: $seqs: $!\n";
31
32
  while(<SEQ>){
32
- if(/^>/){
33
- push @len, 0;
34
- }else{
35
- next if /^;/;
36
- chomp;
37
- s/\W//g;
38
- $len[-1]+=length $_;
39
- }
33
+ if(/^>/){
34
+ push @len, 0;
35
+ }else{
36
+ next if /^;/;
37
+ chomp;
38
+ s/\W//g;
39
+ $len[-1] += length $_;
40
+ }
40
41
  }
41
42
  close SEQ;
42
- @len = sort { $a <=> $b } map { $_>=$minlen?$_:() } @len;
43
+
44
+ @len = sort { $a <=> $b } map { $_ >= $minlen ? $_ : () } @len;
43
45
  my $tot = (sum(@len) || 0);
44
46
 
45
- my $thr = $n__*$tot/100;
47
+ my $thr = $n__ * $tot / 100;
46
48
  my $pos = 0;
47
49
  for(@len){
48
- $pos+= $_;
49
- if($pos>=$thr){
50
- print "N$n__: $_\n";
51
- last;
52
- }
50
+ $pos += $_;
51
+ if($pos >= $thr){
52
+ print "N$n__: $_\n";
53
+ last;
54
+ }
53
55
  }
54
- print "Sequences: ".scalar(@len)."\n";
56
+
57
+ print "Sequences: " . scalar(@len) . "\n";
55
58
  print "Total length: $tot\n";
59
+ print "Longest sequence: " . pop(@len) . "\n";
56
60
 
@@ -1,92 +1,100 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- # @author Luis M. Rodriguez-R
4
- # @license artistic license 2.0
3
+ # frozen_string_literal: true
5
4
 
6
- $:.push File.expand_path("../lib", __FILE__)
7
- require "enveomics_rb/enveomics"
8
- require "enveomics_rb/stat"
5
+ $:.push File.expand_path('../lib', __FILE__)
6
+ require 'enveomics_rb/enveomics'
7
+ require 'enveomics_rb/stats'
8
+ $VERSION = 1.0
9
9
 
10
- o = {q:false, completeness:nil, minlen:500, shuffle:true}
10
+ o = { q: false, completeness: nil, minlen: 500, shuffle: true }
11
11
  OptionParser.new do |opts|
12
- opts.banner = "
13
- Simulates incomplete (fragmented) drafts from complete genomes.
12
+ opts.version = $VERSION
13
+ Enveomics.opt_banner(
14
+ opts, 'Simulates incomplete (fragmented) drafts from complete genomes',
15
+ "#{File.basename($0)} -i in.fasta -o out.fasta -c 0.5 [options]"
16
+ )
14
17
 
15
- Usage: #{$0} [options]"
16
- opts.separator ""
17
- opts.separator "Mandatory"
18
- opts.on("-i", "--in FILE",
19
- "Path to the FastA file containing the complete sequences."
20
- ){ |v| o[:in] = v }
21
- opts.on("-o", "--out FILE", "Path to the FastA to create."){ |v| o[:out] = v }
22
- opts.on("-c", "--completeness FLOAT",
23
- "Fraction of genome completeness to simulate from 0 to 1."
24
- ){ |v| o[:completeness] = v.to_f }
25
- opts.separator ""
26
- opts.separator "Options"
27
- opts.on("-m", "--minlen INT",
28
- "Minimum fragment length to report. By default: #{o[:minlen]}."
29
- ){ |v| o[:minlen] = v.to_i }
30
- opts.on("-s", "--sorted", "Keep fragments sorted as in the input file. ",
31
- "By default, fragments are shuffled."){ |v| o[:shuffle] = !v }
32
- opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:q] = true }
33
- opts.on("-h", "--help", "Display this screen") do
34
- puts opts
35
- exit
36
- end
37
- opts.separator ""
18
+ opts.separator 'Mandatory'
19
+ opts.on(
20
+ '-i', '--in FILE',
21
+ 'Path to the FastA file containing the complete sequences',
22
+ 'Supports compression with .gz extension, use - for STDIN'
23
+ ) { |v| o[:in] = v }
24
+ opts.on(
25
+ '-o', '--out FILE', 'Path to the FastA to create',
26
+ 'Supports compression with .gz extension, use - for STDOUT'
27
+ ) { |v| o[:out] = v }
28
+ opts.on(
29
+ '-c', '--completeness FLOAT',
30
+ 'Fraction of genome completeness to simulate from 0 to 1'
31
+ ) { |v| o[:completeness] = v.to_f }
32
+
33
+ opts.separator ''
34
+ opts.separator 'Options'
35
+ opts.on(
36
+ '-m', '--minlen INT',
37
+ "Minimum fragment length to report. By default: #{o[:minlen]}"
38
+ ) { |v| o[:minlen] = v.to_i }
39
+ opts.on(
40
+ '-s', '--sorted', 'Keep fragments sorted as in the input file',
41
+ 'By default, fragments are shuffled'
42
+ ) { |v| o[:shuffle] = !v }
43
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
44
+ opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
45
+ opts.separator ''
38
46
  end.parse!
39
- abort "-i is mandatory" if o[:in].nil?
40
- abort "-o is mandatory" if o[:out].nil?
41
- abort "-c is mandatory" if o[:completeness].nil?
47
+
48
+ raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
49
+ raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?
50
+ raise Enveomics::OptionError.new('-c is mandatory') if o[:completeness].nil?
42
51
 
43
52
  begin
44
53
  # Read input sequences
45
54
  g_id = []
46
55
  g_seq = []
47
- File.open(o[:in], "r") do |ifh|
48
- id = ""
49
- ifh.each_line do |ln|
50
- if ln =~ /^>(\S*)/
51
- g_id << $1
52
- g_seq << ""
53
- else
54
- g_seq[g_seq.size-1] += ln.gsub(/[^A-Za-z]/,"")
55
- end
56
+ ifh = reader(o[:in])
57
+ id = ''
58
+ ifh.each_line do |ln|
59
+ if ln =~ /^>(\S*)/
60
+ g_id << $1
61
+ g_seq << ''
62
+ else
63
+ g_seq[g_seq.size - 1] += ln.gsub(/[^A-Za-z]/, '')
56
64
  end
57
65
  end
58
-
66
+ ifh.close
67
+
59
68
  # Fragment genomes
60
69
  f = {}
61
70
  binlen = [1, (o[:minlen].to_f/(1.5**2)).ceil].max
62
71
  p = [0.001, [1.0, 1.0 - (o[:completeness]/1.25 + 0.1)].min].max
63
- while not g_seq.empty?
72
+ while !g_seq.empty?
64
73
  id = g_id.shift
65
74
  seq = g_seq.shift
66
75
  gL = seq.length
67
- while not seq.empty?
68
- fL = [0, ((Enve::Stat.r_geom(p).to_f +
69
- Enve::Stat.r_unif(-0.5,0.5))*binlen).round].max
70
- f["#{f.size+1}_#{id}"] = seq[0,fL] if fL >= o[:minlen]
71
- seq = seq[(fL+1) .. -1]
72
- seq = "" if seq.nil?
76
+ while !seq.empty?
77
+ rand_x =
78
+ Enveomics::Stats.r_geom(p).to_f + Enveomics::Stats.r_unif(-0.5, 0.5)
79
+ fL = [0, (rand_x * binlen).round].max
80
+ f["#{f.size+1}_#{id}"] = seq[0, fL] if fL >= o[:minlen]
81
+ seq = seq[(fL + 1) .. -1]
82
+ seq = '' if seq.nil?
73
83
  end
74
84
  end
75
85
 
76
86
  # Save output
77
87
  k = f.keys
78
88
  k.shuffle! if o[:shuffle]
79
- File.open(o[:out], "w") do |ofh|
80
- k.each do |id|
81
- ofh.puts ">#{id}"
82
- ofh.puts f[id].gsub(/(\S{50})/, "\\1\n")
83
- end
89
+ ofh = writer(o[:out])
90
+ k.each do |id|
91
+ ofh.puts ">#{id}"
92
+ ofh.puts f[id].gsub(/(\S{50})/, "\\1\n")
84
93
  end
85
-
94
+ ofh.close
86
95
  rescue => err
87
- $stderr.puts "Exception: #{err}\n\n"
88
- err.backtrace.each { |l| $stderr.puts l + "\n" }
89
- err
96
+ $stderr.puts "Exception: #{err}\n\n"
97
+ err.backtrace.each { |l| $stderr.puts l + "\n" }
98
+ err
90
99
  end
91
100
 
92
-
@@ -1,43 +1,57 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- #
4
- # @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
5
- # @license Artistic-2.0
6
- #
3
+ # frozen_string_literal: false
7
4
 
8
- require 'optparse'
5
+ $VERSION = 1.0
6
+ $:.push File.expand_path('../lib', __FILE__)
7
+ require 'enveomics_rb/enveomics'
9
8
 
10
- o = {q: false, rep: false}
11
- ARGV << '-h' if ARGV.size==0
9
+ o = { q: false, rep: false }
12
10
 
13
11
  OptionParser.new do |opt|
14
- opt.banner = "
15
- Samples a random set of sequences from a multi-FastA file.
16
-
17
- Usage: #{$0} [options]"
18
- opt.separator ''
12
+ Enveomics.opt_banner(
13
+ opt, 'Samples a random set of sequences from a multi-FastA file',
14
+ "#{File.basename($0)} -i seq.fa -o 10pc.fa -f 0.1 [options]"
15
+ )
19
16
  opt.separator 'Mandatory'
20
- opt.on('-i', '--in PATH', 'Input FastA file.'){ |v| o[:i] = v }
21
- opt.on('-o', '--out PATH', 'Output FastA file.'){ |v| o[:o] = v }
22
- opt.on('-f', '--fraction FLOAT',
17
+ opt.on(
18
+ '-i', '--in PATH',
19
+ 'Input FastA file',
20
+ 'Supports compression with .gz extension, use - for STDIN'
21
+ ) { |v| o[:i] = v }
22
+ opt.on(
23
+ '-o', '--out PATH',
24
+ 'Output FastA file',
25
+ 'Supports compression with .gz extension, use - for STDOUT'
26
+ ) { |v| o[:o] = v }
27
+ opt.on(
28
+ '-f', '--fraction FLOAT', Float,
23
29
  'Fraction of sequences to sample [0-1].',
24
- 'Mandatory unless -n is provided.'){ |v| o[:f] = v.to_f }
30
+ 'Mandatory unless -c is provided.'
31
+ ) { |v| o[:f] = v }
25
32
  opt.separator ''
33
+
26
34
  opt.separator 'Options'
27
- opt.on('-c', '--number INT',
28
- 'Number of sequences to sample.',
29
- 'Mandatory unless -f is provided.'){ |v| o[:n] = v.to_i }
30
- opt.on('-r', '--replacement','Sample with replacement'){ |v| o[:rep] = v }
31
- opt.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
35
+ opt.on(
36
+ '-c', '--number INT', Integer,
37
+ 'Number of sequences to sample',
38
+ 'Mandatory unless -f is provided'
39
+ ) { |v| o[:n] = v }
40
+ opt.on('-r', '--replacement','Sample with replacement') { |v| o[:rep] = v }
41
+ opt.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
32
42
  opt.on('-h', '--help', 'Display this screen.') do
33
43
  puts opt
34
44
  exit
35
45
  end
36
46
  opt.separator ''
37
47
  end.parse!
38
- abort '-i is mandatory.' if o[:i].nil?
39
- abort '-o is mandatory.' if o[:o].nil?
40
- abort '-f or -n is mandatory.' if o[:f].nil? and o[:n].nil?
48
+
49
+ raise Enveomics::OptionError.new('-i is mandatory') if o[:i].nil?
50
+ raise Enveomics::OptionError.new('-o is mandatory') if o[:o].nil?
51
+ if o[:f].nil? && o[:n].nil?
52
+ raise Enveomics::OptionError.new('-f or -n is mandatory')
53
+ end
54
+ $QUIET = o[:q]
41
55
 
42
56
  # Functions to parse sequences
43
57
  def do_stuff(id, sq)
@@ -53,31 +67,32 @@ def do_stuff(id, sq)
53
67
  end
54
68
 
55
69
  # Parse sequences
56
- $stderr.puts 'Parsing sequences' unless o[:q]
70
+ say 'Parsing sequences'
57
71
  seq = []
58
- File.open(o[:i], 'r') do |fh|
59
- id = nil
60
- sq = ''
61
- fh.each do |ln|
62
- next if ln =~ /^;/
63
- if ln =~ /^>(.+)/
64
- seq << [id, sq] unless id.nil?
65
- id = $1
66
- sq = ''
67
- else
68
- sq << ln
69
- end
72
+ fh = reader(o[:i])
73
+ id = nil
74
+ sq = ''
75
+ fh.each do |ln|
76
+ next if ln =~ /^;/
77
+ if ln =~ /^>(.+)/
78
+ seq << [id, sq] unless id.nil?
79
+ id = $1
80
+ sq = ''
81
+ else
82
+ sq << ln
70
83
  end
71
- seq << [id, sq] unless id.nil?
72
84
  end
73
- $stderr.puts " Input sequences: #{seq.size}"
85
+ seq << [id, sq] unless id.nil?
86
+ fh.close
87
+ say "Input sequences: #{seq.size}"
88
+
74
89
  o[:n] ||= (seq.size * o[:f]).round
75
- seq_o = o[:rep] ? o[:n].times.map{ seq.sample } : seq.sample(o[:n])
76
- File.open(o[:o], 'w') do |fh|
77
- seq_o.each do |i|
78
- fh.puts ">#{i[0]}"
79
- fh.puts i[1]
80
- end
90
+ seq_o = o[:rep] ? o[:n].times.map { seq.sample } : seq.sample(o[:n])
91
+ fh = writer(o[:o])
92
+ seq_o.each do |i|
93
+ fh.puts ">#{i[0]}"
94
+ fh.puts i[1]
81
95
  end
82
- $stderr.puts " Output sequences: #{seq_o.size}"
96
+ fh.close
97
+ say "Output sequences: #{seq_o.size}"
83
98
 
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+ require 'zlib'
5
+
6
+ o = { qual: 31, encoding: 33 }
7
+ ARGV << '-h' if ARGV.empty?
8
+ OptionParser.new do |opts|
9
+ opts.banner = "
10
+ Creates a FastQ-compliant file from a FastA file.
11
+
12
+ Usage: #{$0} [options]"
13
+ opts.separator ''
14
+ opts.separator 'Options'
15
+ opts.on(
16
+ '-i', '--in FILE', 'Input FastA file (supports .gz compression)'
17
+ ) { |v| o[:in] = v }
18
+ opts.on(
19
+ '-o', '--out FILE', 'Output FastQ file (supports .gz compression)'
20
+ ) { |v| o[:out] = v }
21
+ opts.on(
22
+ '-q', '--quality INT', Integer,
23
+ 'PHRED quality score to use (fixed), in the range [-5, 41]',
24
+ "By default: #{o[:qual]}"
25
+ ) { |v| o[:qual] = v }
26
+ opts.on(
27
+ '--encoding INT', Integer,
28
+ "Base encoding (33 or 64). By default: #{o[:encoding]}"
29
+ ) { |v| o[:encoding] = v }
30
+ opts.on('-h', '--help', 'Display this screen.') do
31
+ puts opts
32
+ exit
33
+ end
34
+ opts.separator ''
35
+ end.parse!
36
+ abort '-i is mandatory' if o[:in].nil?
37
+ abort '-o is mandatory' if o[:out].nil?
38
+ abort '-q must be in the range -5 .. 41' if o[:qual] < -5 || o[:qual] > 41
39
+
40
+ # Determine quality character
41
+ $qchar = (o[:qual] + o[:encoding]).chr
42
+
43
+ # Create file handlers
44
+ ifh = o[:in] =~ /\.gz$/ ?
45
+ Zlib::GzipReader.open(o[:in]) : File.open(o[:in], 'r')
46
+ ofh = o[:out] =~ /\.gz$/ ?
47
+ Zlib::GzipWriter.open(o[:out]) : File.open(o[:out], 'w')
48
+
49
+ def print_seq(ofh, id, seq)
50
+ ofh.puts "@#{id}", seq, '+', $qchar * seq.length unless seq.empty?
51
+ end
52
+
53
+ # Generate FastQ
54
+ id = ''
55
+ seq = ''
56
+ ifh.each_line do |ln|
57
+ next if ln =~ /^;/
58
+ if ln =~ /^>(.*)/
59
+ print_seq(ofh, id, seq)
60
+ seq = ''
61
+ id = $1
62
+ else
63
+ seq += ln.chomp.upcase.gsub(/[^A-Z]/,'')
64
+ end
65
+ end
66
+ print_seq(ofh, id, seq)
67
+ ofh.close
68
+ ifh.close
69
+