miga-base 0.4.3.0 → 0.5.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/miga/cli.rb +43 -223
- data/lib/miga/cli/action/add.rb +91 -62
- data/lib/miga/cli/action/classify_wf.rb +97 -0
- data/lib/miga/cli/action/daemon.rb +14 -10
- data/lib/miga/cli/action/derep_wf.rb +95 -0
- data/lib/miga/cli/action/doctor.rb +83 -55
- data/lib/miga/cli/action/get.rb +68 -52
- data/lib/miga/cli/action/get_db.rb +206 -0
- data/lib/miga/cli/action/index_wf.rb +31 -0
- data/lib/miga/cli/action/init.rb +115 -190
- data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
- data/lib/miga/cli/action/ls.rb +20 -11
- data/lib/miga/cli/action/ncbi_get.rb +199 -157
- data/lib/miga/cli/action/preproc_wf.rb +46 -0
- data/lib/miga/cli/action/quality_wf.rb +45 -0
- data/lib/miga/cli/action/stats.rb +147 -99
- data/lib/miga/cli/action/summary.rb +10 -4
- data/lib/miga/cli/action/tax_dist.rb +61 -46
- data/lib/miga/cli/action/tax_test.rb +46 -39
- data/lib/miga/cli/action/wf.rb +178 -0
- data/lib/miga/cli/base.rb +11 -0
- data/lib/miga/cli/objects_helper.rb +88 -0
- data/lib/miga/cli/opt_helper.rb +160 -0
- data/lib/miga/daemon.rb +7 -4
- data/lib/miga/dataset/base.rb +5 -5
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -1
- data/lib/miga/remote_dataset/base.rb +5 -5
- data/lib/miga/remote_dataset/download.rb +1 -1
- data/lib/miga/version.rb +3 -3
- data/scripts/cds.bash +3 -1
- data/scripts/essential_genes.bash +1 -0
- data/scripts/stats.bash +1 -1
- data/scripts/trimmed_fasta.bash +5 -3
- data/utils/distance/runner.rb +3 -0
- data/utils/distance/temporal.rb +10 -1
- data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
- data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
- data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
- data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
- data/utils/enveomics/Scripts/SRA.download.bash +1 -1
- data/utils/enveomics/Scripts/aai.rb +163 -128
- data/utils/enveomics/build_enveomics_r.bash +11 -10
- data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
- data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
- data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
- data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
- data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
- data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
- data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
- data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
- data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
- data/utils/enveomics/enveomics.R/R/utils.R +31 -15
- data/utils/enveomics/enveomics.R/README.md +7 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
- data/utils/find-medoid.R +3 -2
- data/utils/representatives.rb +5 -3
- data/utils/subclade/pipeline.rb +22 -11
- data/utils/subclade/runner.rb +5 -1
- data/utils/subclades-compile.rb +1 -1
- data/utils/subclades.R +9 -3
- metadata +15 -4
- data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
- data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26
@@ -22,6 +22,10 @@ Usage: #{$0} [options]"
|
|
22
22
|
opt.separator 'Options'
|
23
23
|
opt.on('-a', '--aln-out FILE',
|
24
24
|
'Output file containing the aligned proteins'){ |v| o[:alnout] = v }
|
25
|
+
opt.on('-c', '--components FILE',
|
26
|
+
'Output file containing the components of the estimation.',
|
27
|
+
'Tab-delimited file with model name, matches, and columns.'
|
28
|
+
){ |v| o[:compout] = v }
|
25
29
|
opt.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
|
26
30
|
opt.on('-h', '--help', 'Display this screen.') do
|
27
31
|
puts opt
|
@@ -34,6 +38,7 @@ abort '-2 is mandatory.' if o[:b].nil?
|
|
34
38
|
|
35
39
|
class HList
|
36
40
|
attr_accessor :list
|
41
|
+
|
37
42
|
def initialize(file)
|
38
43
|
@list = {}
|
39
44
|
r = File.readlines(file)
|
@@ -63,6 +68,7 @@ end
|
|
63
68
|
class HElement
|
64
69
|
attr_accessor :defline, :model_id, :protein_id, :protein_coords
|
65
70
|
attr_accessor :model_aln, :protein_aln
|
71
|
+
|
66
72
|
def initialize(defline, model_aln, protein_aln)
|
67
73
|
@defline = defline.chomp
|
68
74
|
@model_aln = model_aln.chomp
|
@@ -81,32 +87,27 @@ class HElement
|
|
81
87
|
##
|
82
88
|
# Returns an HAln object
|
83
89
|
def align(other)
|
90
|
+
return nil unless model_width == other.model_width
|
84
91
|
HAln.new(self, other)
|
85
92
|
end
|
86
93
|
|
87
|
-
def
|
88
|
-
@
|
89
|
-
each_with_index.map{ |
|
90
|
-
|
94
|
+
def masked_protein
|
95
|
+
@masked_protein ||= model_aln.chars.
|
96
|
+
each_with_index.map{ |c, pos| c == 'X' ? protein_aln[pos] : nil }.
|
97
|
+
compact.join('')
|
91
98
|
end
|
92
99
|
|
93
|
-
def
|
94
|
-
|
95
|
-
@model_aln[d] = '-' + @model_aln[d]
|
96
|
-
@protein_aln[d] = '-' + @protein_aln[d]
|
97
|
-
end
|
100
|
+
def model_width
|
101
|
+
masked_protein.size
|
98
102
|
end
|
99
103
|
end
|
100
104
|
|
101
105
|
class HAln
|
102
106
|
attr :protein_1, :protein_2, :model_id, :protein_1_id, :protein_2_id
|
107
|
+
|
103
108
|
def initialize(a, b)
|
104
|
-
|
105
|
-
|
106
|
-
b_masked = b.dup
|
107
|
-
b_masked.mask! b_masked.mask
|
108
|
-
@protein_1 = a_masked.protein_aln
|
109
|
-
@protein_2 = b_masked.protein_aln
|
109
|
+
@protein_1 = a.masked_protein
|
110
|
+
@protein_2 = b.masked_protein
|
110
111
|
@model_id = a.model_id
|
111
112
|
@protein_1_id = a.protein_id + '/' + a.protein_coords
|
112
113
|
@protein_2_id = b.protein_id + '/' + b.protein_coords
|
@@ -116,7 +117,9 @@ class HAln
|
|
116
117
|
@stats = { len: 0, gaps: 0, matches: 0 }
|
117
118
|
return @stats unless @stats[:id].nil?
|
118
119
|
protein_1.chars.each_with_index do |v, k|
|
120
|
+
# Ignore gaps in both proteins
|
119
121
|
next if v == '-' and protein_2[k] == '-'
|
122
|
+
# Count matches
|
120
123
|
@stats[:len] += 1
|
121
124
|
if v == protein_2[k]
|
122
125
|
@stats[:matches] += 1
|
@@ -124,16 +127,16 @@ class HAln
|
|
124
127
|
@stats[:gaps] += 1
|
125
128
|
end
|
126
129
|
end
|
127
|
-
@stats.tap { |i| i[:id] = 100.0 *
|
130
|
+
@stats.tap { |i| i[:id] = 100.0 * i[:matches] / i[:len] }
|
128
131
|
end
|
129
132
|
|
130
133
|
def stats_to_s
|
131
|
-
stats.map{ |k,v| "#{k}:#{v}" }.join
|
134
|
+
stats.map{ |k,v| "#{k}:#{v}" }.join ' '
|
132
135
|
end
|
133
136
|
|
134
137
|
def to_s
|
135
|
-
"# #{model_id} | #{protein_1_id} | #{protein_2_id} | #{stats_to_s}
|
136
|
-
protein_1
|
138
|
+
["# #{model_id} | #{protein_1_id} | #{protein_2_id} | #{stats_to_s}",
|
139
|
+
protein_1, protein_2, ''].join("\n")
|
137
140
|
end
|
138
141
|
end
|
139
142
|
|
@@ -151,8 +154,14 @@ puts "SD identity: #{sd_identity.round(2)}"
|
|
151
154
|
|
152
155
|
if o[:alnout]
|
153
156
|
File.open(o[:alnout], 'w') do |fh|
|
157
|
+
haln_arr.each { |i| fh.puts i }
|
158
|
+
end
|
159
|
+
end
|
160
|
+
|
161
|
+
if o[:compout]
|
162
|
+
File.open(o[:compout], 'w') do |fh|
|
154
163
|
haln_arr.each do |i|
|
155
|
-
fh.puts i
|
164
|
+
fh.puts "#{i.model_id}\t#{i.stats[:matches]}\t#{i.stats[:len]}"
|
156
165
|
end
|
157
166
|
end
|
158
167
|
end
|
@@ -45,7 +45,7 @@ tail -n +2 "$DIR/srr_list.txt" | while read ln ; do
|
|
45
45
|
for uri in $(echo "$ftp" | tr ";" " ") ; do
|
46
46
|
file="$dir/$(basename $uri)"
|
47
47
|
curl "$uri" -o "$file"
|
48
|
-
md5obs=$(md5value "$file")
|
48
|
+
md5obs=$(md5value "$file" 2> /dev/null)
|
49
49
|
if [[ "$md5" == "$md5obs"* ]] ; then
|
50
50
|
md5=$(echo "$md5" | perl -pe 's/^[^;]+;//')
|
51
51
|
else
|
@@ -3,144 +3,177 @@
|
|
3
3
|
# @author Luis M. Rodriguez-R
|
4
4
|
# @license Artistic-2.0
|
5
5
|
|
6
|
-
require
|
7
|
-
require
|
6
|
+
require 'optparse'
|
7
|
+
require 'tmpdir'
|
8
|
+
require 'zlib'
|
8
9
|
has_rest_client = true
|
9
10
|
has_sqlite3 = true
|
10
11
|
begin
|
11
|
-
require
|
12
|
-
require
|
12
|
+
require 'rubygems'
|
13
|
+
require 'restclient'
|
13
14
|
rescue LoadError
|
14
15
|
has_rest_client = false
|
15
16
|
end
|
16
17
|
begin
|
17
|
-
require
|
18
|
+
require 'sqlite3'
|
18
19
|
rescue LoadError
|
19
20
|
has_sqlite3 = false
|
20
21
|
end
|
21
22
|
|
22
|
-
o = {
|
23
|
-
|
24
|
-
|
25
|
-
|
23
|
+
o = {
|
24
|
+
bits: 0, id: 20, len: 0, hits: 50, q: false, bin: '', program: 'blast+',
|
25
|
+
thr: 1, dec: 2, auto: false, lookupfirst: false, dbrbm: true, nucl: false,
|
26
|
+
len_fraction: 0.0, max_actg: 0.95
|
27
|
+
}
|
28
|
+
ARGV << '-h' if ARGV.size == 0
|
26
29
|
OptionParser.new do |opts|
|
27
30
|
opts.banner = "
|
28
|
-
Calculates the Average Amino
|
31
|
+
Calculates the Average Amino Acid Identity between two genomes
|
29
32
|
|
30
33
|
Usage: #{$0} [options]"
|
31
|
-
opts.separator
|
32
|
-
opts.separator
|
33
|
-
opts.on(
|
34
|
-
|
35
|
-
)
|
36
|
-
|
37
|
-
|
38
|
-
|
34
|
+
opts.separator ''
|
35
|
+
opts.separator 'Mandatory'
|
36
|
+
opts.on(
|
37
|
+
'-1', '--seq1 FILE',
|
38
|
+
'Path to the FastA file (.gz allowed) containing the genome 1 (proteins)'
|
39
|
+
) { |v| o[:seq1] = v }
|
40
|
+
opts.on(
|
41
|
+
'-2', '--seq2 FILE',
|
42
|
+
'Path to the FastA file (.gz allowed) containing the genome 2 (proteins)'
|
43
|
+
) { |v| o[:seq2] = v }
|
39
44
|
if has_rest_client
|
40
|
-
opts.separator
|
41
|
-
|
45
|
+
opts.separator ' Alternatively, you can supply the NCBI-acc of a ' +
|
46
|
+
'genome (nucleotides) with the format ncbi:CP014272 instead of files'
|
42
47
|
else
|
43
|
-
opts.separator
|
48
|
+
opts.separator ' Install rest-client to enable NCBI-acc support'
|
44
49
|
end
|
45
|
-
opts.separator
|
46
|
-
opts.separator
|
47
|
-
opts.on(
|
48
|
-
|
49
|
-
)
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
"Minimum
|
59
|
-
|
60
|
-
opts.on(
|
61
|
-
|
62
|
-
)
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
opts.on(
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
opts.
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
opts.on(
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
"
|
90
|
-
|
91
|
-
opts.
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
opts.on(
|
50
|
+
opts.separator ''
|
51
|
+
opts.separator 'Search Options'
|
52
|
+
opts.on(
|
53
|
+
'-l', '--len INT', Integer,
|
54
|
+
"Minimum alignment length (in residues). By default: #{o[:len]}"
|
55
|
+
) { |v| o[:len] = v }
|
56
|
+
opts.on(
|
57
|
+
'-L', '--len-fraction NUM', Float,
|
58
|
+
'Minimum alignment length as a fraction of the shorter sequence',
|
59
|
+
"(range 0-1). By default: #{o[:len_fraction]}"
|
60
|
+
) { |v| o[:len_fraction] = v }
|
61
|
+
opts.on(
|
62
|
+
'-i', '--id FLOAT', Float,
|
63
|
+
"Minimum alignment identity (in %). By default: #{o[:id]}"
|
64
|
+
) { |v| o[:id] = v }
|
65
|
+
opts.on(
|
66
|
+
'-s', '--bitscore FLOAT', Float,
|
67
|
+
"Minimum bit score (in bits). By default: #{o[:bits]}"
|
68
|
+
) { |v| o[:bits] = v }
|
69
|
+
opts.on(
|
70
|
+
'-n', '--hits INT', Integer,
|
71
|
+
"Minimum number of hits. By default: #{o[:hits]}"
|
72
|
+
) { |v| o[:hits] = v }
|
73
|
+
opts.on(
|
74
|
+
'-N', '--nucl',
|
75
|
+
'The input sequences are nucleotides (genes), not proteins'
|
76
|
+
) { |v| o[:nucl] = v }
|
77
|
+
opts.on(
|
78
|
+
'--max-actg FLOAT', Float,
|
79
|
+
'Maximum fraction of ACTGN in the sequences before assuming nucleotides',
|
80
|
+
"By default: #{o[:max_actg]}"
|
81
|
+
) { |v| o[:max_actg] = v }
|
82
|
+
opts.separator ''
|
83
|
+
opts.separator 'Software Options'
|
84
|
+
opts.on(
|
85
|
+
'-b', '--bin DIR',
|
86
|
+
'Path to the directory containing the binaries of the search program'
|
87
|
+
) { |v| o[:bin] = v }
|
88
|
+
opts.on(
|
89
|
+
'-p', '--program STR',
|
90
|
+
'Search program to be used. One of: blast+ (default), blast, blat, diamond'
|
91
|
+
) { |v| o[:program] = v }
|
92
|
+
opts.on(
|
93
|
+
'-t', '--threads INT', Integer,
|
94
|
+
"Number of parallel threads to be used. By default: #{o[:thr]}"
|
95
|
+
) { |v| o[:thr] = v }
|
96
|
+
opts.separator ''
|
97
|
+
opts.separator 'SQLite3 Options'
|
98
|
+
unless has_sqlite3
|
99
|
+
opts.separator ' Install sqlite3 gem to enable database support'
|
100
|
+
end
|
101
|
+
opts.on(
|
102
|
+
'-S', '--sqlite3 FILE',
|
103
|
+
'Path to the SQLite3 database to create (or update) with the results'
|
104
|
+
) { |v| o[:sqlite3] = v }
|
105
|
+
opts.on(
|
106
|
+
'--name1 STR',
|
107
|
+
'Name of --seq1 to use in --sqlite3. By default determined by filename'
|
108
|
+
) { |v| o[:seq1name] = v }
|
109
|
+
opts.on(
|
110
|
+
'--name2 STR',
|
111
|
+
'Name of --seq2 to use in --sqlite3. By default determined by filename'
|
112
|
+
) { |v| o[:seq2name] = v }
|
113
|
+
opts.on(
|
114
|
+
'--[no-]save-rbm',
|
115
|
+
'Save (or don\'t save) the reciprocal best matches in the --sqlite3 db',
|
116
|
+
"By default: #{o[:dbrbm]}"
|
117
|
+
) { |v| o[:dbrbm] = v }
|
118
|
+
opts.on(
|
119
|
+
'--lookup-first',
|
120
|
+
'Indicates if the AAI should be looked up first in the database',
|
121
|
+
'Requires --sqlite3, --auto, --name1, and --name2',
|
122
|
+
'Incompatible with --res, --tab, --out, and --rbm'
|
123
|
+
) { |v| o[:lookupfirst] = v }
|
124
|
+
opts.separator ''
|
125
|
+
opts.separator 'Other Output Options'
|
126
|
+
opts.on(
|
127
|
+
'-d', '--dec INT', Integer,
|
105
128
|
"Decimal positions to report. By default: #{o[:dec]}"
|
106
|
-
|
107
|
-
opts.on(
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
129
|
+
) { |v| o[:dec] = v }
|
130
|
+
opts.on(
|
131
|
+
'-R', '--rbm FILE',
|
132
|
+
'Saves a file with the reciprocal best matches'
|
133
|
+
) { |v| o[:rbm] = v }
|
134
|
+
opts.on(
|
135
|
+
'-o', '--out FILE',
|
136
|
+
'Saves a file describing the alignments used for two-way AAI'
|
137
|
+
) { |v| o[:out] = v }
|
138
|
+
opts.on(
|
139
|
+
'-r', '--res FILE', 'Saves a file with the final results'
|
140
|
+
) { |v| o[:res] = v }
|
141
|
+
opts.on(
|
142
|
+
'-T', '--tab FILE',
|
143
|
+
'Saves a file with the final two-way results in a tab-delimited form',
|
144
|
+
'The columns are (in that order):',
|
145
|
+
'AAI, standard deviation, proteins used, proteins in the smallest genome'
|
146
|
+
) { |v| o[:tab] = v }
|
147
|
+
opts.on(
|
148
|
+
'-a', '--auto',
|
149
|
+
'ONLY outputs the AAI value in STDOUT (or nothing, if calculation fails)'
|
150
|
+
) { o[:auto] = true }
|
151
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
|
152
|
+
opts.on('-h', '--help', 'Display this screen') do
|
124
153
|
puts opts
|
125
154
|
exit
|
126
155
|
end
|
127
|
-
opts.separator
|
156
|
+
opts.separator ''
|
128
157
|
end.parse!
|
129
|
-
|
130
|
-
|
131
|
-
abort
|
132
|
-
abort
|
133
|
-
|
134
|
-
|
158
|
+
|
159
|
+
# Check input
|
160
|
+
abort '-1 is mandatory' if o[:seq1].nil?
|
161
|
+
abort '-2 is mandatory' if o[:seq2].nil?
|
162
|
+
if o[:program] == 'diamond' && o[:nucl]
|
163
|
+
abort '-p diamond is incompatible with -N'
|
164
|
+
end
|
165
|
+
unless o[:sqlite3].nil? or has_sqlite3
|
166
|
+
abort 'SQLite3 requested (-S) but sqlite3 not supported: gem install sqlite3'
|
167
|
+
end
|
168
|
+
o[:bin] = o[:bin] + '/' if o[:bin].size > 0
|
135
169
|
if o[:lookupfirst]
|
136
|
-
abort
|
137
|
-
abort
|
138
|
-
abort
|
139
|
-
abort
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
abort "--lookup-first conflicts with --rbm" unless o[:rbm].nil?
|
170
|
+
abort '--lookup-first requires --name1' if o[:seq1name].nil?
|
171
|
+
abort '--lookup-first requires --name2' if o[:seq2name].nil?
|
172
|
+
abort '--lookup-first needs --sqlite3' if o[:sqlite3].nil?
|
173
|
+
abort '--lookup-first requires --auto' unless o[:auto]
|
174
|
+
%w[res tab out rbm].each do |k|
|
175
|
+
abort "--lookup-first conflicts with --#{k}" unless o[k.to_sym].nil?
|
176
|
+
end
|
144
177
|
end
|
145
178
|
|
146
179
|
# Create SQLite3 file
|
@@ -180,7 +213,7 @@ Dir.mktmpdir do |dir|
|
|
180
213
|
abort "GIs are no longer supported by NCBI. Please use NCBI-acc instead." if
|
181
214
|
/^gi:/.match(o[seq])
|
182
215
|
acc = /^ncbi:(\S+)/.match(o[seq])
|
183
|
-
|
216
|
+
unless acc.nil?
|
184
217
|
abort "NCBI-acc requested, but rest-client not supported. First " +
|
185
218
|
"install gem rest-client." unless has_rest_client
|
186
219
|
abort "NCBI-acc are currently not supported with --nucl. Please use " +
|
@@ -226,22 +259,24 @@ Dir.mktmpdir do |dir|
|
|
226
259
|
seq_len[seq] = [0]
|
227
260
|
actg_cnt[seq] = 0
|
228
261
|
seqs = 0
|
229
|
-
fi = File.
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
262
|
+
fi = File.extname(o[seq]) == '.gz' ?
|
263
|
+
Zlib::GzipReader.open(o[seq]) :
|
264
|
+
File.open(o[seq], 'r')
|
265
|
+
File.open("#{dir}/#{seq.to_s}.fa", 'w') do |fo|
|
266
|
+
fi.each_line do |ln|
|
267
|
+
if ln =~ /^>(\S+)/
|
268
|
+
seqs += 1
|
269
|
+
ori_ids[seq] << $1 unless o[:rbm].nil? and o[:sqlite3].nil?
|
270
|
+
seq_len[seq][seqs] = 0
|
271
|
+
fo.puts ">#{seqs}"
|
272
|
+
else
|
273
|
+
fo.puts ln
|
274
|
+
seq_len[seq][seqs] += ln.chomp.gsub(/[^A-Za-z]/,"").length
|
275
|
+
actg_cnt[seq] += ln.chomp.gsub(/[^ACTGNactgn]/,"").length
|
276
|
+
end
|
241
277
|
end
|
242
278
|
end
|
243
279
|
fi.close
|
244
|
-
fo.close
|
245
280
|
unless o[:nucl]
|
246
281
|
actg_frx = actg_cnt[seq].to_f/seq_len[seq].inject(:+).to_f
|
247
282
|
abort "Input sequences appear to be nucleotides " +
|