miga-base 0.4.3.0 → 0.5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/miga/cli.rb +43 -223
- data/lib/miga/cli/action/add.rb +91 -62
- data/lib/miga/cli/action/classify_wf.rb +97 -0
- data/lib/miga/cli/action/daemon.rb +14 -10
- data/lib/miga/cli/action/derep_wf.rb +95 -0
- data/lib/miga/cli/action/doctor.rb +83 -55
- data/lib/miga/cli/action/get.rb +68 -52
- data/lib/miga/cli/action/get_db.rb +206 -0
- data/lib/miga/cli/action/index_wf.rb +31 -0
- data/lib/miga/cli/action/init.rb +115 -190
- data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
- data/lib/miga/cli/action/ls.rb +20 -11
- data/lib/miga/cli/action/ncbi_get.rb +199 -157
- data/lib/miga/cli/action/preproc_wf.rb +46 -0
- data/lib/miga/cli/action/quality_wf.rb +45 -0
- data/lib/miga/cli/action/stats.rb +147 -99
- data/lib/miga/cli/action/summary.rb +10 -4
- data/lib/miga/cli/action/tax_dist.rb +61 -46
- data/lib/miga/cli/action/tax_test.rb +46 -39
- data/lib/miga/cli/action/wf.rb +178 -0
- data/lib/miga/cli/base.rb +11 -0
- data/lib/miga/cli/objects_helper.rb +88 -0
- data/lib/miga/cli/opt_helper.rb +160 -0
- data/lib/miga/daemon.rb +7 -4
- data/lib/miga/dataset/base.rb +5 -5
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -1
- data/lib/miga/remote_dataset/base.rb +5 -5
- data/lib/miga/remote_dataset/download.rb +1 -1
- data/lib/miga/version.rb +3 -3
- data/scripts/cds.bash +3 -1
- data/scripts/essential_genes.bash +1 -0
- data/scripts/stats.bash +1 -1
- data/scripts/trimmed_fasta.bash +5 -3
- data/utils/distance/runner.rb +3 -0
- data/utils/distance/temporal.rb +10 -1
- data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
- data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
- data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
- data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
- data/utils/enveomics/Scripts/SRA.download.bash +1 -1
- data/utils/enveomics/Scripts/aai.rb +163 -128
- data/utils/enveomics/build_enveomics_r.bash +11 -10
- data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
- data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
- data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
- data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
- data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
- data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
- data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
- data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
- data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
- data/utils/enveomics/enveomics.R/R/utils.R +31 -15
- data/utils/enveomics/enveomics.R/README.md +7 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
- data/utils/find-medoid.R +3 -2
- data/utils/representatives.rb +5 -3
- data/utils/subclade/pipeline.rb +22 -11
- data/utils/subclade/runner.rb +5 -1
- data/utils/subclades-compile.rb +1 -1
- data/utils/subclades.R +9 -3
- metadata +15 -4
- data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
- data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26
@@ -22,6 +22,10 @@ Usage: #{$0} [options]"
|
|
22
22
|
opt.separator 'Options'
|
23
23
|
opt.on('-a', '--aln-out FILE',
|
24
24
|
'Output file containing the aligned proteins'){ |v| o[:alnout] = v }
|
25
|
+
opt.on('-c', '--components FILE',
|
26
|
+
'Output file containing the components of the estimation.',
|
27
|
+
'Tab-delimited file with model name, matches, and columns.'
|
28
|
+
){ |v| o[:compout] = v }
|
25
29
|
opt.on('-q', '--quiet', 'Run quietly (no STDERR output).'){ o[:q] = true }
|
26
30
|
opt.on('-h', '--help', 'Display this screen.') do
|
27
31
|
puts opt
|
@@ -34,6 +38,7 @@ abort '-2 is mandatory.' if o[:b].nil?
|
|
34
38
|
|
35
39
|
class HList
|
36
40
|
attr_accessor :list
|
41
|
+
|
37
42
|
def initialize(file)
|
38
43
|
@list = {}
|
39
44
|
r = File.readlines(file)
|
@@ -63,6 +68,7 @@ end
|
|
63
68
|
class HElement
|
64
69
|
attr_accessor :defline, :model_id, :protein_id, :protein_coords
|
65
70
|
attr_accessor :model_aln, :protein_aln
|
71
|
+
|
66
72
|
def initialize(defline, model_aln, protein_aln)
|
67
73
|
@defline = defline.chomp
|
68
74
|
@model_aln = model_aln.chomp
|
@@ -81,32 +87,27 @@ class HElement
|
|
81
87
|
##
|
82
88
|
# Returns an HAln object
|
83
89
|
def align(other)
|
90
|
+
return nil unless model_width == other.model_width
|
84
91
|
HAln.new(self, other)
|
85
92
|
end
|
86
93
|
|
87
|
-
def
|
88
|
-
@
|
89
|
-
each_with_index.map{ |
|
90
|
-
|
94
|
+
def masked_protein
|
95
|
+
@masked_protein ||= model_aln.chars.
|
96
|
+
each_with_index.map{ |c, pos| c == 'X' ? protein_aln[pos] : nil }.
|
97
|
+
compact.join('')
|
91
98
|
end
|
92
99
|
|
93
|
-
def
|
94
|
-
|
95
|
-
@model_aln[d] = '-' + @model_aln[d]
|
96
|
-
@protein_aln[d] = '-' + @protein_aln[d]
|
97
|
-
end
|
100
|
+
def model_width
|
101
|
+
masked_protein.size
|
98
102
|
end
|
99
103
|
end
|
100
104
|
|
101
105
|
class HAln
|
102
106
|
attr :protein_1, :protein_2, :model_id, :protein_1_id, :protein_2_id
|
107
|
+
|
103
108
|
def initialize(a, b)
|
104
|
-
|
105
|
-
|
106
|
-
b_masked = b.dup
|
107
|
-
b_masked.mask! b_masked.mask
|
108
|
-
@protein_1 = a_masked.protein_aln
|
109
|
-
@protein_2 = b_masked.protein_aln
|
109
|
+
@protein_1 = a.masked_protein
|
110
|
+
@protein_2 = b.masked_protein
|
110
111
|
@model_id = a.model_id
|
111
112
|
@protein_1_id = a.protein_id + '/' + a.protein_coords
|
112
113
|
@protein_2_id = b.protein_id + '/' + b.protein_coords
|
@@ -116,7 +117,9 @@ class HAln
|
|
116
117
|
@stats = { len: 0, gaps: 0, matches: 0 }
|
117
118
|
return @stats unless @stats[:id].nil?
|
118
119
|
protein_1.chars.each_with_index do |v, k|
|
120
|
+
# Ignore gaps in both proteins
|
119
121
|
next if v == '-' and protein_2[k] == '-'
|
122
|
+
# Count matches
|
120
123
|
@stats[:len] += 1
|
121
124
|
if v == protein_2[k]
|
122
125
|
@stats[:matches] += 1
|
@@ -124,16 +127,16 @@ class HAln
|
|
124
127
|
@stats[:gaps] += 1
|
125
128
|
end
|
126
129
|
end
|
127
|
-
@stats.tap { |i| i[:id] = 100.0 *
|
130
|
+
@stats.tap { |i| i[:id] = 100.0 * i[:matches] / i[:len] }
|
128
131
|
end
|
129
132
|
|
130
133
|
def stats_to_s
|
131
|
-
stats.map{ |k,v| "#{k}:#{v}" }.join
|
134
|
+
stats.map{ |k,v| "#{k}:#{v}" }.join ' '
|
132
135
|
end
|
133
136
|
|
134
137
|
def to_s
|
135
|
-
"# #{model_id} | #{protein_1_id} | #{protein_2_id} | #{stats_to_s}
|
136
|
-
protein_1
|
138
|
+
["# #{model_id} | #{protein_1_id} | #{protein_2_id} | #{stats_to_s}",
|
139
|
+
protein_1, protein_2, ''].join("\n")
|
137
140
|
end
|
138
141
|
end
|
139
142
|
|
@@ -151,8 +154,14 @@ puts "SD identity: #{sd_identity.round(2)}"
|
|
151
154
|
|
152
155
|
if o[:alnout]
|
153
156
|
File.open(o[:alnout], 'w') do |fh|
|
157
|
+
haln_arr.each { |i| fh.puts i }
|
158
|
+
end
|
159
|
+
end
|
160
|
+
|
161
|
+
if o[:compout]
|
162
|
+
File.open(o[:compout], 'w') do |fh|
|
154
163
|
haln_arr.each do |i|
|
155
|
-
fh.puts i
|
164
|
+
fh.puts "#{i.model_id}\t#{i.stats[:matches]}\t#{i.stats[:len]}"
|
156
165
|
end
|
157
166
|
end
|
158
167
|
end
|
@@ -45,7 +45,7 @@ tail -n +2 "$DIR/srr_list.txt" | while read ln ; do
|
|
45
45
|
for uri in $(echo "$ftp" | tr ";" " ") ; do
|
46
46
|
file="$dir/$(basename $uri)"
|
47
47
|
curl "$uri" -o "$file"
|
48
|
-
md5obs=$(md5value "$file")
|
48
|
+
md5obs=$(md5value "$file" 2> /dev/null)
|
49
49
|
if [[ "$md5" == "$md5obs"* ]] ; then
|
50
50
|
md5=$(echo "$md5" | perl -pe 's/^[^;]+;//')
|
51
51
|
else
|
@@ -3,144 +3,177 @@
|
|
3
3
|
# @author Luis M. Rodriguez-R
|
4
4
|
# @license Artistic-2.0
|
5
5
|
|
6
|
-
require
|
7
|
-
require
|
6
|
+
require 'optparse'
|
7
|
+
require 'tmpdir'
|
8
|
+
require 'zlib'
|
8
9
|
has_rest_client = true
|
9
10
|
has_sqlite3 = true
|
10
11
|
begin
|
11
|
-
require
|
12
|
-
require
|
12
|
+
require 'rubygems'
|
13
|
+
require 'restclient'
|
13
14
|
rescue LoadError
|
14
15
|
has_rest_client = false
|
15
16
|
end
|
16
17
|
begin
|
17
|
-
require
|
18
|
+
require 'sqlite3'
|
18
19
|
rescue LoadError
|
19
20
|
has_sqlite3 = false
|
20
21
|
end
|
21
22
|
|
22
|
-
o = {
|
23
|
-
|
24
|
-
|
25
|
-
|
23
|
+
o = {
|
24
|
+
bits: 0, id: 20, len: 0, hits: 50, q: false, bin: '', program: 'blast+',
|
25
|
+
thr: 1, dec: 2, auto: false, lookupfirst: false, dbrbm: true, nucl: false,
|
26
|
+
len_fraction: 0.0, max_actg: 0.95
|
27
|
+
}
|
28
|
+
ARGV << '-h' if ARGV.size == 0
|
26
29
|
OptionParser.new do |opts|
|
27
30
|
opts.banner = "
|
28
|
-
Calculates the Average Amino
|
31
|
+
Calculates the Average Amino Acid Identity between two genomes
|
29
32
|
|
30
33
|
Usage: #{$0} [options]"
|
31
|
-
opts.separator
|
32
|
-
opts.separator
|
33
|
-
opts.on(
|
34
|
-
|
35
|
-
)
|
36
|
-
|
37
|
-
|
38
|
-
|
34
|
+
opts.separator ''
|
35
|
+
opts.separator 'Mandatory'
|
36
|
+
opts.on(
|
37
|
+
'-1', '--seq1 FILE',
|
38
|
+
'Path to the FastA file (.gz allowed) containing the genome 1 (proteins)'
|
39
|
+
) { |v| o[:seq1] = v }
|
40
|
+
opts.on(
|
41
|
+
'-2', '--seq2 FILE',
|
42
|
+
'Path to the FastA file (.gz allowed) containing the genome 2 (proteins)'
|
43
|
+
) { |v| o[:seq2] = v }
|
39
44
|
if has_rest_client
|
40
|
-
opts.separator
|
41
|
-
|
45
|
+
opts.separator ' Alternatively, you can supply the NCBI-acc of a ' +
|
46
|
+
'genome (nucleotides) with the format ncbi:CP014272 instead of files'
|
42
47
|
else
|
43
|
-
opts.separator
|
48
|
+
opts.separator ' Install rest-client to enable NCBI-acc support'
|
44
49
|
end
|
45
|
-
opts.separator
|
46
|
-
opts.separator
|
47
|
-
opts.on(
|
48
|
-
|
49
|
-
)
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
"Minimum
|
59
|
-
|
60
|
-
opts.on(
|
61
|
-
|
62
|
-
)
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
opts.on(
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
opts.
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
opts.on(
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
"
|
90
|
-
|
91
|
-
opts.
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
opts.on(
|
50
|
+
opts.separator ''
|
51
|
+
opts.separator 'Search Options'
|
52
|
+
opts.on(
|
53
|
+
'-l', '--len INT', Integer,
|
54
|
+
"Minimum alignment length (in residues). By default: #{o[:len]}"
|
55
|
+
) { |v| o[:len] = v }
|
56
|
+
opts.on(
|
57
|
+
'-L', '--len-fraction NUM', Float,
|
58
|
+
'Minimum alignment length as a fraction of the shorter sequence',
|
59
|
+
"(range 0-1). By default: #{o[:len_fraction]}"
|
60
|
+
) { |v| o[:len_fraction] = v }
|
61
|
+
opts.on(
|
62
|
+
'-i', '--id FLOAT', Float,
|
63
|
+
"Minimum alignment identity (in %). By default: #{o[:id]}"
|
64
|
+
) { |v| o[:id] = v }
|
65
|
+
opts.on(
|
66
|
+
'-s', '--bitscore FLOAT', Float,
|
67
|
+
"Minimum bit score (in bits). By default: #{o[:bits]}"
|
68
|
+
) { |v| o[:bits] = v }
|
69
|
+
opts.on(
|
70
|
+
'-n', '--hits INT', Integer,
|
71
|
+
"Minimum number of hits. By default: #{o[:hits]}"
|
72
|
+
) { |v| o[:hits] = v }
|
73
|
+
opts.on(
|
74
|
+
'-N', '--nucl',
|
75
|
+
'The input sequences are nucleotides (genes), not proteins'
|
76
|
+
) { |v| o[:nucl] = v }
|
77
|
+
opts.on(
|
78
|
+
'--max-actg FLOAT', Float,
|
79
|
+
'Maximum fraction of ACTGN in the sequences before assuming nucleotides',
|
80
|
+
"By default: #{o[:max_actg]}"
|
81
|
+
) { |v| o[:max_actg] = v }
|
82
|
+
opts.separator ''
|
83
|
+
opts.separator 'Software Options'
|
84
|
+
opts.on(
|
85
|
+
'-b', '--bin DIR',
|
86
|
+
'Path to the directory containing the binaries of the search program'
|
87
|
+
) { |v| o[:bin] = v }
|
88
|
+
opts.on(
|
89
|
+
'-p', '--program STR',
|
90
|
+
'Search program to be used. One of: blast+ (default), blast, blat, diamond'
|
91
|
+
) { |v| o[:program] = v }
|
92
|
+
opts.on(
|
93
|
+
'-t', '--threads INT', Integer,
|
94
|
+
"Number of parallel threads to be used. By default: #{o[:thr]}"
|
95
|
+
) { |v| o[:thr] = v }
|
96
|
+
opts.separator ''
|
97
|
+
opts.separator 'SQLite3 Options'
|
98
|
+
unless has_sqlite3
|
99
|
+
opts.separator ' Install sqlite3 gem to enable database support'
|
100
|
+
end
|
101
|
+
opts.on(
|
102
|
+
'-S', '--sqlite3 FILE',
|
103
|
+
'Path to the SQLite3 database to create (or update) with the results'
|
104
|
+
) { |v| o[:sqlite3] = v }
|
105
|
+
opts.on(
|
106
|
+
'--name1 STR',
|
107
|
+
'Name of --seq1 to use in --sqlite3. By default determined by filename'
|
108
|
+
) { |v| o[:seq1name] = v }
|
109
|
+
opts.on(
|
110
|
+
'--name2 STR',
|
111
|
+
'Name of --seq2 to use in --sqlite3. By default determined by filename'
|
112
|
+
) { |v| o[:seq2name] = v }
|
113
|
+
opts.on(
|
114
|
+
'--[no-]save-rbm',
|
115
|
+
'Save (or don\'t save) the reciprocal best matches in the --sqlite3 db',
|
116
|
+
"By default: #{o[:dbrbm]}"
|
117
|
+
) { |v| o[:dbrbm] = v }
|
118
|
+
opts.on(
|
119
|
+
'--lookup-first',
|
120
|
+
'Indicates if the AAI should be looked up first in the database',
|
121
|
+
'Requires --sqlite3, --auto, --name1, and --name2',
|
122
|
+
'Incompatible with --res, --tab, --out, and --rbm'
|
123
|
+
) { |v| o[:lookupfirst] = v }
|
124
|
+
opts.separator ''
|
125
|
+
opts.separator 'Other Output Options'
|
126
|
+
opts.on(
|
127
|
+
'-d', '--dec INT', Integer,
|
105
128
|
"Decimal positions to report. By default: #{o[:dec]}"
|
106
|
-
|
107
|
-
opts.on(
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
129
|
+
) { |v| o[:dec] = v }
|
130
|
+
opts.on(
|
131
|
+
'-R', '--rbm FILE',
|
132
|
+
'Saves a file with the reciprocal best matches'
|
133
|
+
) { |v| o[:rbm] = v }
|
134
|
+
opts.on(
|
135
|
+
'-o', '--out FILE',
|
136
|
+
'Saves a file describing the alignments used for two-way AAI'
|
137
|
+
) { |v| o[:out] = v }
|
138
|
+
opts.on(
|
139
|
+
'-r', '--res FILE', 'Saves a file with the final results'
|
140
|
+
) { |v| o[:res] = v }
|
141
|
+
opts.on(
|
142
|
+
'-T', '--tab FILE',
|
143
|
+
'Saves a file with the final two-way results in a tab-delimited form',
|
144
|
+
'The columns are (in that order):',
|
145
|
+
'AAI, standard deviation, proteins used, proteins in the smallest genome'
|
146
|
+
) { |v| o[:tab] = v }
|
147
|
+
opts.on(
|
148
|
+
'-a', '--auto',
|
149
|
+
'ONLY outputs the AAI value in STDOUT (or nothing, if calculation fails)'
|
150
|
+
) { o[:auto] = true }
|
151
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
|
152
|
+
opts.on('-h', '--help', 'Display this screen') do
|
124
153
|
puts opts
|
125
154
|
exit
|
126
155
|
end
|
127
|
-
opts.separator
|
156
|
+
opts.separator ''
|
128
157
|
end.parse!
|
129
|
-
|
130
|
-
|
131
|
-
abort
|
132
|
-
abort
|
133
|
-
|
134
|
-
|
158
|
+
|
159
|
+
# Check input
|
160
|
+
abort '-1 is mandatory' if o[:seq1].nil?
|
161
|
+
abort '-2 is mandatory' if o[:seq2].nil?
|
162
|
+
if o[:program] == 'diamond' && o[:nucl]
|
163
|
+
abort '-p diamond is incompatible with -N'
|
164
|
+
end
|
165
|
+
unless o[:sqlite3].nil? or has_sqlite3
|
166
|
+
abort 'SQLite3 requested (-S) but sqlite3 not supported: gem install sqlite3'
|
167
|
+
end
|
168
|
+
o[:bin] = o[:bin] + '/' if o[:bin].size > 0
|
135
169
|
if o[:lookupfirst]
|
136
|
-
abort
|
137
|
-
abort
|
138
|
-
abort
|
139
|
-
abort
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
abort "--lookup-first conflicts with --rbm" unless o[:rbm].nil?
|
170
|
+
abort '--lookup-first requires --name1' if o[:seq1name].nil?
|
171
|
+
abort '--lookup-first requires --name2' if o[:seq2name].nil?
|
172
|
+
abort '--lookup-first needs --sqlite3' if o[:sqlite3].nil?
|
173
|
+
abort '--lookup-first requires --auto' unless o[:auto]
|
174
|
+
%w[res tab out rbm].each do |k|
|
175
|
+
abort "--lookup-first conflicts with --#{k}" unless o[k.to_sym].nil?
|
176
|
+
end
|
144
177
|
end
|
145
178
|
|
146
179
|
# Create SQLite3 file
|
@@ -180,7 +213,7 @@ Dir.mktmpdir do |dir|
|
|
180
213
|
abort "GIs are no longer supported by NCBI. Please use NCBI-acc instead." if
|
181
214
|
/^gi:/.match(o[seq])
|
182
215
|
acc = /^ncbi:(\S+)/.match(o[seq])
|
183
|
-
|
216
|
+
unless acc.nil?
|
184
217
|
abort "NCBI-acc requested, but rest-client not supported. First " +
|
185
218
|
"install gem rest-client." unless has_rest_client
|
186
219
|
abort "NCBI-acc are currently not supported with --nucl. Please use " +
|
@@ -226,22 +259,24 @@ Dir.mktmpdir do |dir|
|
|
226
259
|
seq_len[seq] = [0]
|
227
260
|
actg_cnt[seq] = 0
|
228
261
|
seqs = 0
|
229
|
-
fi = File.
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
262
|
+
fi = File.extname(o[seq]) == '.gz' ?
|
263
|
+
Zlib::GzipReader.open(o[seq]) :
|
264
|
+
File.open(o[seq], 'r')
|
265
|
+
File.open("#{dir}/#{seq.to_s}.fa", 'w') do |fo|
|
266
|
+
fi.each_line do |ln|
|
267
|
+
if ln =~ /^>(\S+)/
|
268
|
+
seqs += 1
|
269
|
+
ori_ids[seq] << $1 unless o[:rbm].nil? and o[:sqlite3].nil?
|
270
|
+
seq_len[seq][seqs] = 0
|
271
|
+
fo.puts ">#{seqs}"
|
272
|
+
else
|
273
|
+
fo.puts ln
|
274
|
+
seq_len[seq][seqs] += ln.chomp.gsub(/[^A-Za-z]/,"").length
|
275
|
+
actg_cnt[seq] += ln.chomp.gsub(/[^ACTGNactgn]/,"").length
|
276
|
+
end
|
241
277
|
end
|
242
278
|
end
|
243
279
|
fi.close
|
244
|
-
fo.close
|
245
280
|
unless o[:nucl]
|
246
281
|
actg_frx = actg_cnt[seq].to_f/seq_len[seq].inject(:+).to_f
|
247
282
|
abort "Input sequences appear to be nucleotides " +
|