miga-base 0.4.3.0 → 0.5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/lib/miga/cli.rb +43 -223
  4. data/lib/miga/cli/action/add.rb +91 -62
  5. data/lib/miga/cli/action/classify_wf.rb +97 -0
  6. data/lib/miga/cli/action/daemon.rb +14 -10
  7. data/lib/miga/cli/action/derep_wf.rb +95 -0
  8. data/lib/miga/cli/action/doctor.rb +83 -55
  9. data/lib/miga/cli/action/get.rb +68 -52
  10. data/lib/miga/cli/action/get_db.rb +206 -0
  11. data/lib/miga/cli/action/index_wf.rb +31 -0
  12. data/lib/miga/cli/action/init.rb +115 -190
  13. data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
  14. data/lib/miga/cli/action/ls.rb +20 -11
  15. data/lib/miga/cli/action/ncbi_get.rb +199 -157
  16. data/lib/miga/cli/action/preproc_wf.rb +46 -0
  17. data/lib/miga/cli/action/quality_wf.rb +45 -0
  18. data/lib/miga/cli/action/stats.rb +147 -99
  19. data/lib/miga/cli/action/summary.rb +10 -4
  20. data/lib/miga/cli/action/tax_dist.rb +61 -46
  21. data/lib/miga/cli/action/tax_test.rb +46 -39
  22. data/lib/miga/cli/action/wf.rb +178 -0
  23. data/lib/miga/cli/base.rb +11 -0
  24. data/lib/miga/cli/objects_helper.rb +88 -0
  25. data/lib/miga/cli/opt_helper.rb +160 -0
  26. data/lib/miga/daemon.rb +7 -4
  27. data/lib/miga/dataset/base.rb +5 -5
  28. data/lib/miga/project/base.rb +4 -4
  29. data/lib/miga/project/result.rb +2 -1
  30. data/lib/miga/remote_dataset/base.rb +5 -5
  31. data/lib/miga/remote_dataset/download.rb +1 -1
  32. data/lib/miga/version.rb +3 -3
  33. data/scripts/cds.bash +3 -1
  34. data/scripts/essential_genes.bash +1 -0
  35. data/scripts/stats.bash +1 -1
  36. data/scripts/trimmed_fasta.bash +5 -3
  37. data/utils/distance/runner.rb +3 -0
  38. data/utils/distance/temporal.rb +10 -1
  39. data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
  40. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
  41. data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
  42. data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
  43. data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
  44. data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
  45. data/utils/enveomics/Scripts/SRA.download.bash +1 -1
  46. data/utils/enveomics/Scripts/aai.rb +163 -128
  47. data/utils/enveomics/build_enveomics_r.bash +11 -10
  48. data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
  49. data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
  50. data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
  51. data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
  52. data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
  53. data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
  54. data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
  55. data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
  56. data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
  57. data/utils/enveomics/enveomics.R/R/utils.R +31 -15
  58. data/utils/enveomics/enveomics.R/README.md +7 -0
  59. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
  60. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
  61. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
  62. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
  63. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
  64. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
  65. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
  66. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
  67. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
  68. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
  69. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
  70. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
  71. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
  72. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
  73. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
  74. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
  75. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
  76. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
  77. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
  78. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
  79. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
  80. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
  81. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
  82. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
  83. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
  84. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
  85. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
  86. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
  87. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
  88. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
  89. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
  90. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
  91. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
  92. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
  93. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
  94. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
  95. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
  96. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
  97. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
  98. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
  99. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
  100. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
  101. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
  102. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
  103. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
  104. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
  105. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
  106. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
  107. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
  108. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
  109. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
  110. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
  111. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
  112. data/utils/find-medoid.R +3 -2
  113. data/utils/representatives.rb +5 -3
  114. data/utils/subclade/pipeline.rb +22 -11
  115. data/utils/subclade/runner.rb +5 -1
  116. data/utils/subclades-compile.rb +1 -1
  117. data/utils/subclades.R +9 -3
  118. metadata +15 -4
  119. data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
  120. data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26
@@ -22,6 +22,10 @@ Usage: #{$0} [options]"
22
22
  opt.separator 'Options'
23
23
  opt.on('-a', '--aln-out FILE',
24
24
  'Output file containing the aligned proteins'){ |v| o[:alnout] = v }
25
+ opt.on('-c', '--components FILE',
26
+ 'Output file containing the components of the estimation.',
27
+ 'Tab-delimited file with model name, matches, and columns.'
28
+ ){ |v| o[:compout] = v }
25
29
  opt.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
26
30
  opt.on('-h', '--help', 'Display this screen.') do
27
31
  puts opt
@@ -34,6 +38,7 @@ abort '-2 is mandatory.' if o[:b].nil?
34
38
 
35
39
  class HList
36
40
  attr_accessor :list
41
+
37
42
  def initialize(file)
38
43
  @list = {}
39
44
  r = File.readlines(file)
@@ -63,6 +68,7 @@ end
63
68
  class HElement
64
69
  attr_accessor :defline, :model_id, :protein_id, :protein_coords
65
70
  attr_accessor :model_aln, :protein_aln
71
+
66
72
  def initialize(defline, model_aln, protein_aln)
67
73
  @defline = defline.chomp
68
74
  @model_aln = model_aln.chomp
@@ -81,32 +87,27 @@ class HElement
81
87
  ##
82
88
  # Returns an HAln object
83
89
  def align(other)
90
+ return nil unless model_width == other.model_width
84
91
  HAln.new(self, other)
85
92
  end
86
93
 
87
- def mask
88
- @mask ||= model_aln.chars.
89
- each_with_index.map{ |v, k| v == '.' ? k : nil }.
90
- compact.reverse
94
+ def masked_protein
95
+ @masked_protein ||= model_aln.chars.
96
+ each_with_index.map{ |c, pos| c == 'X' ? protein_aln[pos] : nil }.
97
+ compact.join('')
91
98
  end
92
99
 
93
- def mask!(template)
94
- (template - mask).each do |d|
95
- @model_aln[d] = '-' + @model_aln[d]
96
- @protein_aln[d] = '-' + @protein_aln[d]
97
- end
100
+ def model_width
101
+ masked_protein.size
98
102
  end
99
103
  end
100
104
 
101
105
  class HAln
102
106
  attr :protein_1, :protein_2, :model_id, :protein_1_id, :protein_2_id
107
+
103
108
  def initialize(a, b)
104
- a_masked = a.dup
105
- a_masked.mask! b.mask.reverse
106
- b_masked = b.dup
107
- b_masked.mask! b_masked.mask
108
- @protein_1 = a_masked.protein_aln
109
- @protein_2 = b_masked.protein_aln
109
+ @protein_1 = a.masked_protein
110
+ @protein_2 = b.masked_protein
110
111
  @model_id = a.model_id
111
112
  @protein_1_id = a.protein_id + '/' + a.protein_coords
112
113
  @protein_2_id = b.protein_id + '/' + b.protein_coords
@@ -116,7 +117,9 @@ class HAln
116
117
  @stats = { len: 0, gaps: 0, matches: 0 }
117
118
  return @stats unless @stats[:id].nil?
118
119
  protein_1.chars.each_with_index do |v, k|
120
+ # Ignore gaps in both proteins
119
121
  next if v == '-' and protein_2[k] == '-'
122
+ # Count matches
120
123
  @stats[:len] += 1
121
124
  if v == protein_2[k]
122
125
  @stats[:matches] += 1
@@ -124,16 +127,16 @@ class HAln
124
127
  @stats[:gaps] += 1
125
128
  end
126
129
  end
127
- @stats.tap { |i| i[:id] = 100.0 * @stats[:matches] / @stats[:len] }
130
+ @stats.tap { |i| i[:id] = 100.0 * i[:matches] / i[:len] }
128
131
  end
129
132
 
130
133
  def stats_to_s
131
- stats.map{ |k,v| "#{k}:#{v}" }.join " "
134
+ stats.map{ |k,v| "#{k}:#{v}" }.join ' '
132
135
  end
133
136
 
134
137
  def to_s
135
- "# #{model_id} | #{protein_1_id} | #{protein_2_id} | #{stats_to_s}\n" +
136
- protein_1 + "\n" + protein_2 + "\n"
138
+ ["# #{model_id} | #{protein_1_id} | #{protein_2_id} | #{stats_to_s}",
139
+ protein_1, protein_2, ''].join("\n")
137
140
  end
138
141
  end
139
142
 
@@ -151,8 +154,14 @@ puts "SD identity: #{sd_identity.round(2)}"
151
154
 
152
155
  if o[:alnout]
153
156
  File.open(o[:alnout], 'w') do |fh|
157
+ haln_arr.each { |i| fh.puts i }
158
+ end
159
+ end
160
+
161
+ if o[:compout]
162
+ File.open(o[:compout], 'w') do |fh|
154
163
  haln_arr.each do |i|
155
- fh.puts i
164
+ fh.puts "#{i.model_id}\t#{i.stats[:matches]}\t#{i.stats[:len]}"
156
165
  end
157
166
  end
158
167
  end
@@ -45,7 +45,7 @@ tail -n +2 "$DIR/srr_list.txt" | while read ln ; do
45
45
  for uri in $(echo "$ftp" | tr ";" " ") ; do
46
46
  file="$dir/$(basename $uri)"
47
47
  curl "$uri" -o "$file"
48
- md5obs=$(md5value "$file")
48
+ md5obs=$(md5value "$file" 2> /dev/null)
49
49
  if [[ "$md5" == "$md5obs"* ]] ; then
50
50
  md5=$(echo "$md5" | perl -pe 's/^[^;]+;//')
51
51
  else
@@ -3,144 +3,177 @@
3
3
  # @author Luis M. Rodriguez-R
4
4
  # @license Artistic-2.0
5
5
 
6
- require "optparse"
7
- require "tmpdir"
6
+ require 'optparse'
7
+ require 'tmpdir'
8
+ require 'zlib'
8
9
  has_rest_client = true
9
10
  has_sqlite3 = true
10
11
  begin
11
- require "rubygems"
12
- require "restclient"
12
+ require 'rubygems'
13
+ require 'restclient'
13
14
  rescue LoadError
14
15
  has_rest_client = false
15
16
  end
16
17
  begin
17
- require "sqlite3"
18
+ require 'sqlite3'
18
19
  rescue LoadError
19
20
  has_sqlite3 = false
20
21
  end
21
22
 
22
- o = {bits:0, id:20, len:0, hits:50, q:false, bin:"", program:"blast+", thr:1,
23
- dec:2, auto:false, lookupfirst:false, dbrbm: true, nucl: false,
24
- len_fraction:0.0, max_actg:0.95}
25
- ARGV << "-h" if ARGV.size==0
23
+ o = {
24
+ bits: 0, id: 20, len: 0, hits: 50, q: false, bin: '', program: 'blast+',
25
+ thr: 1, dec: 2, auto: false, lookupfirst: false, dbrbm: true, nucl: false,
26
+ len_fraction: 0.0, max_actg: 0.95
27
+ }
28
+ ARGV << '-h' if ARGV.size == 0
26
29
  OptionParser.new do |opts|
27
30
  opts.banner = "
28
- Calculates the Average Amino acid Identity between two genomes.
31
+ Calculates the Average Amino Acid Identity between two genomes
29
32
 
30
33
  Usage: #{$0} [options]"
31
- opts.separator ""
32
- opts.separator "Mandatory"
33
- opts.on("-1", "--seq1 FILE",
34
- "Path to the FastA file containing the genome 1 (proteins)."
35
- ){ |v| o[:seq1] = v }
36
- opts.on("-2", "--seq2 FILE",
37
- "Path to the FastA file containing the genome 2 (proteins)."
38
- ){ |v| o[:seq2] = v }
34
+ opts.separator ''
35
+ opts.separator 'Mandatory'
36
+ opts.on(
37
+ '-1', '--seq1 FILE',
38
+ 'Path to the FastA file (.gz allowed) containing the genome 1 (proteins)'
39
+ ) { |v| o[:seq1] = v }
40
+ opts.on(
41
+ '-2', '--seq2 FILE',
42
+ 'Path to the FastA file (.gz allowed) containing the genome 2 (proteins)'
43
+ ) { |v| o[:seq2] = v }
39
44
  if has_rest_client
40
- opts.separator " Alternatively, you can supply the NCBI-acc of a " +
41
- "genome (nucleotides) with the format ncbi:CP014272 instead of files."
45
+ opts.separator ' Alternatively, you can supply the NCBI-acc of a ' +
46
+ 'genome (nucleotides) with the format ncbi:CP014272 instead of files'
42
47
  else
43
- opts.separator " Install rest-client to enable NCBI-acc support."
48
+ opts.separator ' Install rest-client to enable NCBI-acc support'
44
49
  end
45
- opts.separator ""
46
- opts.separator "Search Options"
47
- opts.on("-l", "--len INT",
48
- "Minimum alignment length (in residues). By default: #{o[:len]}."
49
- ){ |v| o[:len] = v.to_i }
50
- opts.on("-L", "--len-fraction NUM",
51
- "Minimum alignment length as a fraction of the shorter sequence",
52
- "(range 0-1). By default: #{o[:len_fraction]}."
53
- ){ |v| o[:len_fraction] = v.to_f }
54
- opts.on("-i", "--id NUM",
55
- "Minimum alignment identity (in %). By default: #{o[:id]}."
56
- ){ |v| o[:id] = v.to_f }
57
- opts.on("-s", "--bitscore NUM",
58
- "Minimum bit score (in bits). By default: #{o[:bits]}."
59
- ){ |v| o[:bits] = v.to_f }
60
- opts.on("-n", "--hits INT",
61
- "Minimum number of hits. By default: #{o[:hits]}."
62
- ){ |v| o[:hits] = v.to_i }
63
- opts.on("-N", "--nucl",
64
- "The input sequences are nucleotides (genes), not proteins."
65
- ){ |v| o[:nucl] = v }
66
- opts.on("--max-actg FLOAT",
67
- "Maximum fraction of ACTGN in the sequences before assuming nucleotides.",
68
- "By default: #{o[:max_actg]}."
69
- ){ |v| o[:max_actg] = v.to_f }
70
- opts.separator ""
71
- opts.separator "Software Options"
72
- opts.on("-b", "--bin DIR",
73
- "Path to the directory containing the binaries of the search program."
74
- ){ |v| o[:bin] = v }
75
- opts.on("-p", "--program STR",
76
- "Search program to be used. One of: blast+ (default), blast, blat, diamond."
77
- ){ |v| o[:program] = v }
78
- opts.on("-t", "--threads INT",
79
- "Number of parallel threads to be used. By default: #{o[:thr]}."
80
- ){ |v| o[:thr] = v.to_i }
81
- opts.separator ""
82
- opts.separator "SQLite3 Options"
83
- opts.on("-S", "--sqlite3 FILE",
84
- "Path to the SQLite3 database to create (or update) with the results."
85
- ){ |v| o[:sqlite3] = v }
86
- opts.separator " Install sqlite3 gem to enable database support." unless
87
- has_sqlite3
88
- opts.on("--name1 STR",
89
- "Name of --seq1 to use in --sqlite3. By default determined by filename."
90
- ){ |v| o[:seq1name] = v }
91
- opts.on("--name2 STR",
92
- "Name of --seq2 to use in --sqlite3. By default determined by filename."
93
- ){ |v| o[:seq2name] = v }
94
- opts.on("--[no-]save-rbm",
95
- "Save (or don't save) the reciprocal best matches in the --sqlite3 db.",
96
- "By default: #{o[:dbrbm]}."){ |v| o[:dbrbm] = !!v }
97
- opts.on("--lookup-first",
98
- "Indicates if the AAI should be looked up first in the database.",
99
- "Requires --sqlite3, --auto, --name1, and --name2.",
100
- "Incompatible with --res, --tab, --out, and --rbm."
101
- ){ |v| o[:lookupfirst] = v }
102
- opts.separator ""
103
- opts.separator "Other Output Options"
104
- opts.on("-d", "--dec INT",
50
+ opts.separator ''
51
+ opts.separator 'Search Options'
52
+ opts.on(
53
+ '-l', '--len INT', Integer,
54
+ "Minimum alignment length (in residues). By default: #{o[:len]}"
55
+ ) { |v| o[:len] = v }
56
+ opts.on(
57
+ '-L', '--len-fraction NUM', Float,
58
+ 'Minimum alignment length as a fraction of the shorter sequence',
59
+ "(range 0-1). By default: #{o[:len_fraction]}"
60
+ ) { |v| o[:len_fraction] = v }
61
+ opts.on(
62
+ '-i', '--id FLOAT', Float,
63
+ "Minimum alignment identity (in %). By default: #{o[:id]}"
64
+ ) { |v| o[:id] = v }
65
+ opts.on(
66
+ '-s', '--bitscore FLOAT', Float,
67
+ "Minimum bit score (in bits). By default: #{o[:bits]}"
68
+ ) { |v| o[:bits] = v }
69
+ opts.on(
70
+ '-n', '--hits INT', Integer,
71
+ "Minimum number of hits. By default: #{o[:hits]}"
72
+ ) { |v| o[:hits] = v }
73
+ opts.on(
74
+ '-N', '--nucl',
75
+ 'The input sequences are nucleotides (genes), not proteins'
76
+ ) { |v| o[:nucl] = v }
77
+ opts.on(
78
+ '--max-actg FLOAT', Float,
79
+ 'Maximum fraction of ACTGN in the sequences before assuming nucleotides',
80
+ "By default: #{o[:max_actg]}"
81
+ ) { |v| o[:max_actg] = v }
82
+ opts.separator ''
83
+ opts.separator 'Software Options'
84
+ opts.on(
85
+ '-b', '--bin DIR',
86
+ 'Path to the directory containing the binaries of the search program'
87
+ ) { |v| o[:bin] = v }
88
+ opts.on(
89
+ '-p', '--program STR',
90
+ 'Search program to be used. One of: blast+ (default), blast, blat, diamond'
91
+ ) { |v| o[:program] = v }
92
+ opts.on(
93
+ '-t', '--threads INT', Integer,
94
+ "Number of parallel threads to be used. By default: #{o[:thr]}"
95
+ ) { |v| o[:thr] = v }
96
+ opts.separator ''
97
+ opts.separator 'SQLite3 Options'
98
+ unless has_sqlite3
99
+ opts.separator ' Install sqlite3 gem to enable database support'
100
+ end
101
+ opts.on(
102
+ '-S', '--sqlite3 FILE',
103
+ 'Path to the SQLite3 database to create (or update) with the results'
104
+ ) { |v| o[:sqlite3] = v }
105
+ opts.on(
106
+ '--name1 STR',
107
+ 'Name of --seq1 to use in --sqlite3. By default determined by filename'
108
+ ) { |v| o[:seq1name] = v }
109
+ opts.on(
110
+ '--name2 STR',
111
+ 'Name of --seq2 to use in --sqlite3. By default determined by filename'
112
+ ) { |v| o[:seq2name] = v }
113
+ opts.on(
114
+ '--[no-]save-rbm',
115
+ 'Save (or don\'t save) the reciprocal best matches in the --sqlite3 db',
116
+ "By default: #{o[:dbrbm]}"
117
+ ) { |v| o[:dbrbm] = v }
118
+ opts.on(
119
+ '--lookup-first',
120
+ 'Indicates if the AAI should be looked up first in the database',
121
+ 'Requires --sqlite3, --auto, --name1, and --name2',
122
+ 'Incompatible with --res, --tab, --out, and --rbm'
123
+ ) { |v| o[:lookupfirst] = v }
124
+ opts.separator ''
125
+ opts.separator 'Other Output Options'
126
+ opts.on(
127
+ '-d', '--dec INT', Integer,
105
128
  "Decimal positions to report. By default: #{o[:dec]}"
106
- ){ |v| o[:dec] = v.to_i }
107
- opts.on("-R", "--rbm FILE",
108
- "Saves a file with the reciprocal best matches."){ |v| o[:rbm] = v }
109
- opts.on("-o", "--out FILE",
110
- "Saves a file describing the alignments used for two-way AAI."
111
- ){ |v| o[:out] = v }
112
- opts.on("-r", "--res FILE",
113
- "Saves a file with the final results."){ |v| o[:res] = v }
114
- opts.on("-T", "--tab FILE",
115
- "Saves a file with the final two-way results in a tab-delimited form.",
116
- "The columns are (in that order):",
117
- "AAI, standard deviation, proteins used, proteins in the smallest genome."
118
- ){ |v| o[:tab]=v }
119
- opts.on("-a", "--auto",
120
- "ONLY outputs the AAI value in STDOUT (or nothing, if calculation fails)."
121
- ){ o[:auto] = true }
122
- opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:q] = true }
123
- opts.on("-h", "--help", "Display this screen") do
129
+ ) { |v| o[:dec] = v }
130
+ opts.on(
131
+ '-R', '--rbm FILE',
132
+ 'Saves a file with the reciprocal best matches'
133
+ ) { |v| o[:rbm] = v }
134
+ opts.on(
135
+ '-o', '--out FILE',
136
+ 'Saves a file describing the alignments used for two-way AAI'
137
+ ) { |v| o[:out] = v }
138
+ opts.on(
139
+ '-r', '--res FILE', 'Saves a file with the final results'
140
+ ) { |v| o[:res] = v }
141
+ opts.on(
142
+ '-T', '--tab FILE',
143
+ 'Saves a file with the final two-way results in a tab-delimited form',
144
+ 'The columns are (in that order):',
145
+ 'AAI, standard deviation, proteins used, proteins in the smallest genome'
146
+ ) { |v| o[:tab] = v }
147
+ opts.on(
148
+ '-a', '--auto',
149
+ 'ONLY outputs the AAI value in STDOUT (or nothing, if calculation fails)'
150
+ ) { o[:auto] = true }
151
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
152
+ opts.on('-h', '--help', 'Display this screen') do
124
153
  puts opts
125
154
  exit
126
155
  end
127
- opts.separator ""
156
+ opts.separator ''
128
157
  end.parse!
129
- abort "-1 is mandatory" if o[:seq1].nil?
130
- abort "-2 is mandatory" if o[:seq2].nil?
131
- abort "-p diamond is incompatible with -N" if o[:program]=="diamond" && o[:nucl]
132
- abort "SQLite3 requested (-S) but sqlite3 not supported. First install gem " +
133
- "sqlite3." unless o[:sqlite3].nil? or has_sqlite3
134
- o[:bin] = o[:bin]+"/" if o[:bin].size > 0
158
+
159
+ # Check input
160
+ abort '-1 is mandatory' if o[:seq1].nil?
161
+ abort '-2 is mandatory' if o[:seq2].nil?
162
+ if o[:program] == 'diamond' && o[:nucl]
163
+ abort '-p diamond is incompatible with -N'
164
+ end
165
+ unless o[:sqlite3].nil? or has_sqlite3
166
+ abort 'SQLite3 requested (-S) but sqlite3 not supported: gem install sqlite3'
167
+ end
168
+ o[:bin] = o[:bin] + '/' if o[:bin].size > 0
135
169
  if o[:lookupfirst]
136
- abort "--lookup-first needs --sqlite3" if o[:sqlite3].nil?
137
- abort "--lookup-first requires --auto" unless o[:auto]
138
- abort "--lookup-first requires --name1" if o[:seq1name].nil?
139
- abort "--lookup-first requires --name2" if o[:seq2name].nil?
140
- abort "--lookup-first conflicts with --res" unless o[:res].nil?
141
- abort "--lookup-first conflicts with --tab" unless o[:tab].nil?
142
- abort "--lookup-first conflicts with --out" unless o[:out].nil?
143
- abort "--lookup-first conflicts with --rbm" unless o[:rbm].nil?
170
+ abort '--lookup-first requires --name1' if o[:seq1name].nil?
171
+ abort '--lookup-first requires --name2' if o[:seq2name].nil?
172
+ abort '--lookup-first needs --sqlite3' if o[:sqlite3].nil?
173
+ abort '--lookup-first requires --auto' unless o[:auto]
174
+ %w[res tab out rbm].each do |k|
175
+ abort "--lookup-first conflicts with --#{k}" unless o[k.to_sym].nil?
176
+ end
144
177
  end
145
178
 
146
179
  # Create SQLite3 file
@@ -180,7 +213,7 @@ Dir.mktmpdir do |dir|
180
213
  abort "GIs are no longer supported by NCBI. Please use NCBI-acc instead." if
181
214
  /^gi:/.match(o[seq])
182
215
  acc = /^ncbi:(\S+)/.match(o[seq])
183
- if not acc.nil?
216
+ unless acc.nil?
184
217
  abort "NCBI-acc requested, but rest-client not supported. First " +
185
218
  "install gem rest-client." unless has_rest_client
186
219
  abort "NCBI-acc are currently not supported with --nucl. Please use " +
@@ -226,22 +259,24 @@ Dir.mktmpdir do |dir|
226
259
  seq_len[seq] = [0]
227
260
  actg_cnt[seq] = 0
228
261
  seqs = 0
229
- fi = File.open(o[seq], "r")
230
- fo = File.open("#{dir}/#{seq.to_s}.fa", "w")
231
- fi.each_line do |ln|
232
- if ln =~ /^>(\S+)/
233
- seqs += 1
234
- ori_ids[seq] << $1 unless o[:rbm].nil? and o[:sqlite3].nil?
235
- seq_len[seq][seqs] = 0
236
- fo.puts ">#{seqs}"
237
- else
238
- fo.puts ln
239
- seq_len[seq][seqs] += ln.chomp.gsub(/[^A-Za-z]/,"").length
240
- actg_cnt[seq] += ln.chomp.gsub(/[^ACTGNactgn]/,"").length
262
+ fi = File.extname(o[seq]) == '.gz' ?
263
+ Zlib::GzipReader.open(o[seq]) :
264
+ File.open(o[seq], 'r')
265
+ File.open("#{dir}/#{seq.to_s}.fa", 'w') do |fo|
266
+ fi.each_line do |ln|
267
+ if ln =~ /^>(\S+)/
268
+ seqs += 1
269
+ ori_ids[seq] << $1 unless o[:rbm].nil? and o[:sqlite3].nil?
270
+ seq_len[seq][seqs] = 0
271
+ fo.puts ">#{seqs}"
272
+ else
273
+ fo.puts ln
274
+ seq_len[seq][seqs] += ln.chomp.gsub(/[^A-Za-z]/,"").length
275
+ actg_cnt[seq] += ln.chomp.gsub(/[^ACTGNactgn]/,"").length
276
+ end
241
277
  end
242
278
  end
243
279
  fi.close
244
- fo.close
245
280
  unless o[:nucl]
246
281
  actg_frx = actg_cnt[seq].to_f/seq_len[seq].inject(:+).to_f
247
282
  abort "Input sequences appear to be nucleotides " +