genevalidator 1.6.1 → 1.6.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/.travis.yml +2 -0
- data/README.md +78 -30
- data/Rakefile +11 -8
- data/aux/app_template_footer.erb +1 -6
- data/aux/app_template_header.erb +12 -32
- data/aux/files/css/style.css +2 -8
- data/aux/files/js/plots.js +564 -576
- data/aux/files/js/script.js +10 -0
- data/aux/json_footer.erb +8 -0
- data/aux/json_header.erb +19 -0
- data/aux/json_query.erb +14 -0
- data/aux/template_footer.erb +9 -58
- data/aux/template_header.erb +18 -58
- data/aux/template_query.erb +8 -36
- data/bin/genevalidator +45 -32
- data/genevalidator.gemspec +11 -7
- data/lib/genevalidator.rb +75 -455
- data/lib/genevalidator/arg_validation.rb +78 -107
- data/lib/genevalidator/blast.rb +57 -60
- data/lib/genevalidator/clusterization.rb +15 -15
- data/lib/genevalidator/exceptions.rb +32 -5
- data/lib/genevalidator/get_raw_sequences.rb +70 -33
- data/lib/genevalidator/hsp.rb +1 -4
- data/lib/genevalidator/json_to_gv_results.rb +109 -0
- data/lib/genevalidator/output.rb +177 -185
- data/lib/genevalidator/pool.rb +2 -1
- data/lib/genevalidator/sequences.rb +3 -3
- data/lib/genevalidator/tabular_parser.rb +24 -18
- data/lib/genevalidator/validation.rb +279 -0
- data/lib/genevalidator/validation_alignment.rb +31 -47
- data/lib/genevalidator/validation_blast_reading_frame.rb +19 -18
- data/lib/genevalidator/validation_duplication.rb +23 -19
- data/lib/genevalidator/validation_gene_merge.rb +30 -65
- data/lib/genevalidator/validation_length_cluster.rb +14 -53
- data/lib/genevalidator/validation_length_rank.rb +10 -11
- data/lib/genevalidator/validation_open_reading_frame.rb +18 -19
- data/lib/genevalidator/validation_report.rb +2 -5
- data/lib/genevalidator/validation_test.rb +8 -4
- data/lib/genevalidator/version.rb +1 -1
- data/test/test_all_validations.rb +51 -66
- data/test/test_blast.rb +68 -51
- data/test/test_clusterization.rb +1 -1
- data/test/test_clusterization_2d.rb +19 -13
- data/test/test_extended_array_methods.rb +1 -1
- data/test/test_files/all_validations_mrna/mrna.blast_tab6 +1806 -0
- data/test/test_files/all_validations_mrna/mrna.blast_tab7 +1865 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml → mrna.blast_xml} +18642 -1
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml.index → mrna.blast_xml.index} +300 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta → mrna.fa} +0 -0
- data/test/test_files/all_validations_mrna/mrna.raw_seq +3970 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml.raw_seq.idx → mrna.raw_seq.idx} +901 -1
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_tab → prot.blast_tab6} +416 -0
- data/test/test_files/all_validations_prot/prot.blast_tab7 +2400 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml → prot.blast_xml} +18299 -6723
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.index → prot.blast_xml.index} +408 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta → prot.fa} +0 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.raw_seq → prot.raw_seq} +2735 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.raw_seq.idx → prot.raw_seq.idx} +3032 -1808
- data/test/test_sequences.rb +46 -41
- data/test/test_validation_open_reading_frame.rb +318 -202
- data/test/test_validations.rb +48 -32
- metadata +76 -102
- data/doc/AliasDuplicationError.html +0 -134
- data/doc/AlignmentValidation.html +0 -1687
- data/doc/AlignmentValidationOutput.html +0 -659
- data/doc/Blast.html +0 -1905
- data/doc/BlastRFValidationOutput.html +0 -545
- data/doc/BlastReadingFrameValidation.html +0 -370
- data/doc/BlastUtils.html +0 -875
- data/doc/ClasspathError.html +0 -134
- data/doc/Cluster.html +0 -1316
- data/doc/DuplciationValidationOutput.html +0 -564
- data/doc/DuplicationValidation.html +0 -920
- data/doc/DuplicationValidationOutput.html +0 -564
- data/doc/FileNotFoundException.html +0 -134
- data/doc/GeneMergeValidation.html +0 -935
- data/doc/GeneMergeValidationOutput.html +0 -652
- data/doc/HierarchicalClusterization.html +0 -994
- data/doc/Hsp.html +0 -1485
- data/doc/InconsistentTabularFormat.html +0 -135
- data/doc/LengthClusterValidation.html +0 -982
- data/doc/LengthClusterValidationOutput.html +0 -515
- data/doc/LengthRankValidation.html +0 -496
- data/doc/LengthRankValidationOutput.html +0 -517
- data/doc/NoInternetError.html +0 -135
- data/doc/NoMafftInstallationError.html +0 -134
- data/doc/NoPIdentError.html +0 -134
- data/doc/NoValidationError.html +0 -134
- data/doc/NotEnoughHitsError.html +0 -135
- data/doc/ORFValidationOutput.html +0 -593
- data/doc/OpenReadingFrameValidation.html +0 -1107
- data/doc/OtherError.html +0 -123
- data/doc/Output.html +0 -1540
- data/doc/Pair.html +0 -309
- data/doc/PairCluster.html +0 -767
- data/doc/Plot.html +0 -837
- data/doc/QueryError.html +0 -134
- data/doc/ReportClassError.html +0 -135
- data/doc/Sequence.html +0 -1299
- data/doc/SequenceTypeError.html +0 -135
- data/doc/TabularEntry.html +0 -837
- data/doc/TabularParser.html +0 -1104
- data/doc/Validation.html +0 -2147
- data/doc/ValidationClassError.html +0 -134
- data/doc/ValidationOutput.html +0 -460
- data/doc/ValidationReport.html +0 -940
- data/doc/ValidationTest.html +0 -939
- data/doc/_index.html +0 -449
- data/doc/class_list.html +0 -54
- data/doc/css/common.css +0 -1
- data/doc/css/full_list.css +0 -57
- data/doc/css/style.css +0 -338
- data/doc/file.README.html +0 -151
- data/doc/file_list.html +0 -56
- data/doc/frames.html +0 -26
- data/doc/index.html +0 -151
- data/doc/js/app.js +0 -214
- data/doc/js/full_list.js +0 -178
- data/doc/js/jquery.js +0 -4
- data/doc/method_list.html +0 -1505
- data/doc/top-level-namespace.html +0 -112
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab +0 -967
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.index +0 -967
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.raw_seq +0 -4929
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.raw_seq.idx +0 -1006
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_xml.raw_seq +0 -2075
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.index +0 -1864
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.raw_seq +0 -42411
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.raw_seq.idx +0 -3751
@@ -1,26 +1,39 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
|
3
|
+
require 'genevalidator/blast'
|
4
|
+
|
1
5
|
# A module to validate the command line Arguments
|
2
6
|
## CREDIT: some of these methods have been adapted from SequenceServer
|
3
7
|
module GeneValidator
|
4
8
|
# TODO: If a tabular file is provided, ensure that a tabular file has the
|
5
9
|
# right number of columns
|
6
10
|
# TODO: assert_if_ruby_version_is_supported
|
7
|
-
# A
|
8
|
-
|
11
|
+
# A class to validate the arguments passed to the Validation Class
|
12
|
+
class GVArgValidation
|
9
13
|
class << self
|
10
|
-
|
14
|
+
extend Forwardable
|
15
|
+
def_delegators GeneValidator, :opt
|
16
|
+
|
17
|
+
def validate_args
|
11
18
|
@opt = opt
|
12
19
|
assert_output_dir_does_not_exist
|
13
20
|
assert_file_present('input file', opt[:input_fasta_file])
|
14
21
|
assert_input_file_probably_fasta
|
15
|
-
|
22
|
+
assert_input_sequence
|
16
23
|
assert_BLAST_output_files
|
17
24
|
|
18
25
|
assert_validations_arg
|
19
26
|
check_num_threads
|
20
27
|
|
28
|
+
export_bin_dirs unless @opt[:bin].nil?
|
29
|
+
|
21
30
|
Blast.validate(opt) unless @opt[:test]
|
22
|
-
|
23
|
-
|
31
|
+
assert_mafft_installation
|
32
|
+
end
|
33
|
+
|
34
|
+
# Return `true` if the given command exists and is executable.
|
35
|
+
def command?(command)
|
36
|
+
system("which #{command} > /dev/null 2>&1")
|
24
37
|
end
|
25
38
|
|
26
39
|
private
|
@@ -37,10 +50,10 @@ module GeneValidator
|
|
37
50
|
def check_num_threads
|
38
51
|
@opt[:num_threads] = Integer(@opt[:num_threads])
|
39
52
|
unless @opt[:num_threads] > 0
|
40
|
-
puts 'Number of threads can not be lower than 0'
|
53
|
+
$stderr.puts 'Number of threads can not be lower than 0'
|
41
54
|
end
|
42
55
|
return unless @opt[:num_threads] > 256
|
43
|
-
puts "Number of threads set at #{@opt[:num_threads]} is unusually high."
|
56
|
+
$stderr.puts "Number of threads set at #{@opt[:num_threads]} is unusually high."
|
44
57
|
end
|
45
58
|
|
46
59
|
def assert_BLAST_output_files
|
@@ -56,18 +69,18 @@ module GeneValidator
|
|
56
69
|
def assert_output_dir_does_not_exist
|
57
70
|
output_dir = "#{@opt[:input_fasta_file]}.html"
|
58
71
|
return unless File.exist?(output_dir)
|
59
|
-
puts "The output directory already exists for this fasta file.\n"
|
60
|
-
puts "Please remove the following directory: #{output_dir}\n"
|
61
|
-
puts "You can run the following command to remove the folder.\n"
|
62
|
-
puts "\n $ rm -r #{output_dir} \n"
|
72
|
+
$stderr.puts "The output directory already exists for this fasta file.\n"
|
73
|
+
$stderr.puts "Please remove the following directory: #{output_dir}\n"
|
74
|
+
$stderr.puts "You can run the following command to remove the folder.\n"
|
75
|
+
$stderr.puts "\n $ rm -r #{output_dir} \n"
|
63
76
|
exit 1
|
64
77
|
end
|
65
78
|
|
66
79
|
def assert_tabular_options_exists
|
67
80
|
return if @opt[:blast_tabular_options]
|
68
|
-
puts '*** Error: BLAST tabular options (-o) have not been set.'
|
69
|
-
puts ' Please set the "-o" option with the custom format'
|
70
|
-
puts ' used in the BLAST -outfmt argument'
|
81
|
+
$stderr.puts '*** Error: BLAST tabular options (-o) have not been set.'
|
82
|
+
$stderr.puts ' Please set the "-o" option with the custom format'
|
83
|
+
$stderr.puts ' used in the BLAST -outfmt argument'
|
71
84
|
exit 1
|
72
85
|
end
|
73
86
|
|
@@ -79,20 +92,45 @@ module GeneValidator
|
|
79
92
|
|
80
93
|
def assert_file_present(desc, file, exit_code = 1)
|
81
94
|
return if file && File.exist?(File.expand_path(file))
|
82
|
-
puts "*** Error: Couldn't find the #{desc}: #{file}."
|
95
|
+
$stderr.puts "*** Error: Couldn't find the #{desc}: #{file}."
|
83
96
|
exit exit_code
|
84
97
|
end
|
85
98
|
|
86
99
|
alias_method :assert_dir_present, :assert_file_present
|
87
100
|
|
88
|
-
def
|
101
|
+
def assert_input_sequence
|
89
102
|
fasta_content = IO.binread(@opt[:input_fasta_file])
|
90
103
|
type = BlastUtils.type_of_sequences(fasta_content)
|
91
104
|
return if type == :nucleotide || type == :protein
|
92
|
-
puts '*** Error: The input files does not contain just protein or'
|
93
|
-
puts ' nucleotide data. Please correct this and try again.'
|
105
|
+
$stderr.puts '*** Error: The input files does not contain just protein or'
|
106
|
+
$stderr.puts ' nucleotide data. Please correct this and try again.'
|
94
107
|
exit 1
|
95
108
|
end
|
109
|
+
|
110
|
+
def export_bin_dirs
|
111
|
+
@opt[:bin].each do |bin|
|
112
|
+
if File.directory?(bin)
|
113
|
+
add_to_path(bin)
|
114
|
+
else
|
115
|
+
$stderr.puts '*** The following bin directory does not exist:'
|
116
|
+
$stderr.puts " #{bin}"
|
117
|
+
end
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
## Checks if dir is in $PATH and if not, it adds the dir to the $PATH.
|
122
|
+
def add_to_path(bin_dir)
|
123
|
+
return if ENV['PATH'].split(':').include?(bin_dir)
|
124
|
+
ENV['PATH'] = "#{bin_dir}:#{ENV['PATH']}"
|
125
|
+
end
|
126
|
+
|
127
|
+
def assert_mafft_installation
|
128
|
+
return if command?('mafft')
|
129
|
+
$stderr.puts '*** Could not find Mafft binaries.'
|
130
|
+
$stderr.puts ' Ignoring error and continuing - Please note that' \
|
131
|
+
' some validations may be skipped.'
|
132
|
+
$stderr.puts # a blank line
|
133
|
+
end
|
96
134
|
end
|
97
135
|
|
98
136
|
# Validates BLAST Installation (And BLAST databases)
|
@@ -106,40 +144,32 @@ module GeneValidator
|
|
106
144
|
EXIT_NO_BLAST_DATABASE = 4
|
107
145
|
|
108
146
|
def validate(opt)
|
109
|
-
@opt = opt
|
110
147
|
assert_blast_installation
|
111
|
-
|
112
|
-
assert_local_blast_database_exists if
|
148
|
+
warn_if_remote_database(opt[:db])
|
149
|
+
assert_local_blast_database_exists(opt[:db]) if opt[:db] !~ /remote/
|
113
150
|
end
|
114
151
|
|
115
152
|
def assert_blast_installation
|
116
153
|
# Validate BLAST installation
|
117
|
-
|
118
|
-
|
119
|
-
assert_blast_compatible
|
120
|
-
else
|
121
|
-
export_bin_dir
|
122
|
-
end
|
154
|
+
assert_blast_installed
|
155
|
+
assert_blast_compatible
|
123
156
|
end
|
124
157
|
|
125
|
-
def
|
126
|
-
return
|
127
|
-
puts
|
128
|
-
puts '
|
129
|
-
puts
|
130
|
-
puts
|
131
|
-
|
132
|
-
puts
|
133
|
-
puts # a blank line
|
134
|
-
puts " $ genevalidator -d 'swissprot -remote' Input_File"
|
135
|
-
exit 1
|
158
|
+
def warn_if_remote_database(db)
|
159
|
+
return if db !~ /remote/
|
160
|
+
$stderr.puts # a blank line
|
161
|
+
$stderr.puts 'Warning: BLAST will be carried out on remote servers.'
|
162
|
+
$stderr.puts 'This may take quite a bit of time.'
|
163
|
+
$stderr.puts 'You may want to install a local BLAST database for' \
|
164
|
+
' faster analyses.'
|
165
|
+
$stderr.puts # a blank line
|
136
166
|
end
|
137
167
|
|
138
|
-
def assert_local_blast_database_exists
|
139
|
-
return if system("blastdbcmd -db #{
|
140
|
-
puts '*** No BLAST database found at the provided path.'
|
141
|
-
puts ' Please ensure that the provided path is correct
|
142
|
-
|
168
|
+
def assert_local_blast_database_exists(db)
|
169
|
+
return if system("blastdbcmd -db #{db} -info > /dev/null 2>&1")
|
170
|
+
$stderr.puts '*** No BLAST database found at the provided path.'
|
171
|
+
$stderr.puts ' Please ensure that the provided path is correct' \
|
172
|
+
' and then try again.'
|
143
173
|
exit EXIT_NO_BLAST_DATABASE
|
144
174
|
end
|
145
175
|
|
@@ -147,77 +177,18 @@ module GeneValidator
|
|
147
177
|
|
148
178
|
def assert_blast_installed
|
149
179
|
return if GVArgValidation.command?('blastdbcmd')
|
150
|
-
puts '*** Could not find BLAST+ binaries.'
|
180
|
+
$stderr.puts '*** Could not find BLAST+ binaries.'
|
151
181
|
exit EXIT_BLAST_NOT_INSTALLED
|
152
182
|
end
|
153
183
|
|
154
184
|
def assert_blast_compatible
|
155
185
|
version = `blastdbcmd -version`.split[1]
|
156
186
|
return if version >= MINIMUM_BLAST_VERSION
|
157
|
-
puts "*** Your BLAST+ version #{version} is outdated."
|
158
|
-
puts ' GeneValidator needs NCBI BLAST+ version' \
|
159
|
-
|
187
|
+
$stderr.puts "*** Your BLAST+ version #{version} is outdated."
|
188
|
+
$stderr.puts ' GeneValidator needs NCBI BLAST+ version' \
|
189
|
+
" #{MINIMUM_BLAST_VERSION} or higher."
|
160
190
|
exit EXIT_BLAST_NOT_COMPATIBLE
|
161
191
|
end
|
162
|
-
|
163
|
-
def export_bin_dir
|
164
|
-
if File.directory?(@opt[:blast_bin])
|
165
|
-
GVArgValidation.add_to_path(@opt[:blast_bin])
|
166
|
-
else
|
167
|
-
puts '*** The provided BLAST bin directory does not exist.'
|
168
|
-
puts ' Please ensure that the provided BLAST bin directory is' \
|
169
|
-
' correct and try again.'
|
170
|
-
exit EXIT_BLAST_NOT_INSTALLED
|
171
|
-
end
|
172
|
-
end
|
173
|
-
end
|
174
|
-
end
|
175
|
-
|
176
|
-
# Validates Mafft installation
|
177
|
-
class Mafft
|
178
|
-
class << self
|
179
|
-
def assert_mafft_installation(opt)
|
180
|
-
@opt = opt
|
181
|
-
if @opt[:mafft_bin].nil?
|
182
|
-
assert_mafft_installed
|
183
|
-
else
|
184
|
-
export_bin_dir
|
185
|
-
end
|
186
|
-
end
|
187
|
-
|
188
|
-
private
|
189
|
-
|
190
|
-
def assert_mafft_installed
|
191
|
-
return if GVArgValidation.command?('mafft')
|
192
|
-
puts '*** Could not find Mafft binaries.'
|
193
|
-
puts ' Ignoring error and continuing - Please note that some' \
|
194
|
-
' validations may be skipped.'
|
195
|
-
puts # a blank line
|
196
|
-
end
|
197
|
-
|
198
|
-
def export_bin_dir
|
199
|
-
if File.directory?(@opt[:mafft_bin])
|
200
|
-
GVArgValidation.add_to_path(@opt[:mafft_bin])
|
201
|
-
else
|
202
|
-
puts '*** The provided Mafft bin directory does not exist.'
|
203
|
-
puts ' Ignoring error and continuing - Please note that some' \
|
204
|
-
' validations may be skipped.'
|
205
|
-
puts # a blank line
|
206
|
-
end
|
207
|
-
end
|
208
|
-
end
|
209
|
-
end
|
210
|
-
|
211
|
-
class << self
|
212
|
-
## Checks if dir is in $PATH and if not, it adds the dir to the $PATH.
|
213
|
-
def add_to_path(bin_dir)
|
214
|
-
return if ENV['PATH'].split(':').include?(bin_dir)
|
215
|
-
ENV['PATH'] = "#{bin_dir}:#{ENV['PATH']}"
|
216
|
-
end
|
217
|
-
|
218
|
-
# Return `true` if the given command exists and is executable.
|
219
|
-
def command?(command)
|
220
|
-
system("which #{command} > /dev/null 2>&1")
|
221
192
|
end
|
222
193
|
end
|
223
194
|
end
|
data/lib/genevalidator/blast.rb
CHANGED
@@ -1,19 +1,19 @@
|
|
1
|
-
require '
|
1
|
+
require 'bio'
|
2
|
+
require 'bio-blastxmlparser'
|
3
|
+
require 'forwardable'
|
4
|
+
|
5
|
+
require 'genevalidator/exceptions'
|
2
6
|
require 'genevalidator/hsp'
|
7
|
+
require 'genevalidator/sequences'
|
3
8
|
require 'genevalidator/output'
|
4
|
-
require 'genevalidator/exceptions'
|
5
|
-
require 'bio-blastxmlparser'
|
6
|
-
require 'net/http'
|
7
|
-
require 'open-uri'
|
8
|
-
require 'uri'
|
9
|
-
require 'io/console'
|
10
|
-
require 'yaml'
|
11
|
-
require 'bio'
|
12
9
|
|
13
10
|
module GeneValidator
|
14
11
|
# Contains methods that run BLAST and methods that analyse sequences
|
15
12
|
class BlastUtils
|
16
13
|
class << self
|
14
|
+
extend Forwardable
|
15
|
+
def_delegators GeneValidator, :opt, :config
|
16
|
+
|
17
17
|
EVALUE = 1e-5
|
18
18
|
|
19
19
|
##
|
@@ -25,7 +25,10 @@ module GeneValidator
|
|
25
25
|
# +num_threads+: The number of threads to run BLAST with.
|
26
26
|
# Output:
|
27
27
|
# String with the blast xml output
|
28
|
-
def run_blast(
|
28
|
+
def run_blast(query, db = opt[:db], seq_type = config[:type],
|
29
|
+
num_threads = opt[:num_threads])
|
30
|
+
|
31
|
+
blast_type = (seq_type == :protein) ? 'blastp' : 'blastx'
|
29
32
|
# -num_threads is not supported on remote databases
|
30
33
|
threads = (db !~ /remote/) ? "-num_threads #{num_threads}" : ''
|
31
34
|
|
@@ -46,20 +49,26 @@ module GeneValidator
|
|
46
49
|
# +nr_hits+: max number of hits
|
47
50
|
# Output:
|
48
51
|
# XML file
|
49
|
-
def
|
50
|
-
|
52
|
+
def run_blast_on_input_file(input_file = opt[:input_fasta_file],
|
53
|
+
db = opt[:db], seq_type = config[:type],
|
54
|
+
num_threads = opt[:num_threads])
|
55
|
+
return if opt[:blast_xml_file] || opt[:blast_tabular_file]
|
56
|
+
|
57
|
+
$stderr.puts 'Running BLAST'
|
58
|
+
opt[:blast_xml_file] = input_file + '.blast_xml'
|
59
|
+
|
51
60
|
blast_type = (seq_type == :protein) ? 'blastp' : 'blastx'
|
52
61
|
# -num_threads is not supported on remote databases
|
53
|
-
threads = (opt[:db] !~ /remote/) ? "-num_threads #{
|
62
|
+
threads = (opt[:db] !~ /remote/) ? "-num_threads #{num_threads}" : ''
|
54
63
|
|
55
|
-
blastcmd = "#{blast_type} -query '#{
|
56
|
-
" -out '#{opt[:blast_xml_file]}' -db #{
|
64
|
+
blastcmd = "#{blast_type} -query '#{input_file}'" \
|
65
|
+
" -out '#{opt[:blast_xml_file]}' -db #{db} " \
|
57
66
|
" -evalue #{EVALUE} -outfmt 5 #{threads}"
|
58
67
|
|
59
68
|
`#{blastcmd}`
|
60
69
|
return unless File.zero?(opt[:blast_xml_file])
|
61
|
-
puts 'Blast failed to run on the input file. Please ensure that the'
|
62
|
-
puts 'BLAST database exists and try again'
|
70
|
+
$stderr.puts 'Blast failed to run on the input file. Please ensure that the'
|
71
|
+
$stderr.puts 'BLAST database exists and try again'
|
63
72
|
exit 1
|
64
73
|
end
|
65
74
|
|
@@ -70,9 +79,7 @@ module GeneValidator
|
|
70
79
|
# +type+: the type of the sequence: :nucleotide or :protein
|
71
80
|
# Outputs:
|
72
81
|
# Array of +Sequence+ objects corresponding to the list of hits
|
73
|
-
def parse_next(iterator, type)
|
74
|
-
fail TypeError unless iterator.is_a? Enumerator
|
75
|
-
|
82
|
+
def parse_next(iterator, type = config[:type])
|
76
83
|
hits = []
|
77
84
|
iter = iterator.next
|
78
85
|
|
@@ -85,7 +92,6 @@ module GeneValidator
|
|
85
92
|
seq.type = :protein
|
86
93
|
seq.identifier = hit.hit_id
|
87
94
|
seq.definition = hit.hit_def
|
88
|
-
# puts seq.identifier
|
89
95
|
seq.accession_no = hit.accession
|
90
96
|
|
91
97
|
# get all high-scoring segment pairs (hsp)
|
@@ -93,7 +99,7 @@ module GeneValidator
|
|
93
99
|
|
94
100
|
hit.hsps.each do |hsp|
|
95
101
|
current_hsp = Hsp.new
|
96
|
-
current_hsp.hsp_evalue = '%.0e'
|
102
|
+
current_hsp.hsp_evalue = format('%.0e', hsp.evalue)
|
97
103
|
|
98
104
|
current_hsp.hit_from = hsp.hit_from.to_i
|
99
105
|
current_hsp.hit_to = hsp.hit_to.to_i
|
@@ -102,20 +108,20 @@ module GeneValidator
|
|
102
108
|
|
103
109
|
if type == :nucleotide
|
104
110
|
current_hsp.match_query_from /= 3
|
105
|
-
current_hsp.match_query_to
|
111
|
+
current_hsp.match_query_to /= 3
|
106
112
|
current_hsp.match_query_from += 1
|
107
|
-
current_hsp.match_query_to
|
113
|
+
current_hsp.match_query_to += 1
|
108
114
|
end
|
109
115
|
|
110
116
|
current_hsp.query_reading_frame = hsp.query_frame.to_i
|
111
117
|
|
112
118
|
current_hsp.hit_alignment = hsp.hseq.to_s
|
113
|
-
if
|
119
|
+
if guess_sequence_type(current_hsp.hit_alignment) != :protein
|
114
120
|
fail SequenceTypeError
|
115
121
|
end
|
116
122
|
|
117
123
|
current_hsp.query_alignment = hsp.qseq.to_s
|
118
|
-
if
|
124
|
+
if guess_sequence_type(current_hsp.query_alignment) != :protein
|
119
125
|
fail SequenceTypeError
|
120
126
|
end
|
121
127
|
current_hsp.align_len = hsp.align_len.to_i
|
@@ -129,23 +135,35 @@ module GeneValidator
|
|
129
135
|
hits.push(seq)
|
130
136
|
end
|
131
137
|
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
line = error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0]
|
136
|
-
$stderr.print "Type error at #{line}. Possible cause: you didn't call" \
|
137
|
-
" parse method first!\n"
|
138
|
-
exit 1
|
139
|
-
rescue SequenceTypeError => error
|
140
|
-
line = error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0]
|
141
|
-
$stderr.print "Sequence Type error at #{line}. Possible cause: the" \
|
142
|
-
'blast output was not obtained against a protein' \
|
143
|
-
" database.\n"
|
138
|
+
hits
|
139
|
+
rescue SequenceTypeError => e
|
140
|
+
$stderr.puts e
|
144
141
|
exit 1
|
145
142
|
rescue StopIteration
|
146
143
|
nil
|
147
144
|
end
|
148
145
|
|
146
|
+
##
|
147
|
+
# Method copied from sequenceserver/sequencehelpers.rb
|
148
|
+
# Splits input at putative fasta definition lines (like ">adsfadsf");
|
149
|
+
# then guesses sequence type for each sequence.
|
150
|
+
# If not enough sequence to determine, returns nil.
|
151
|
+
# If 2 kinds of sequence mixed together, raises ArgumentError
|
152
|
+
# Otherwise, returns :nucleotide or :protein
|
153
|
+
# Params:
|
154
|
+
# +sequence_string+: String to validate
|
155
|
+
# Output:
|
156
|
+
# nil, :nucleotide or :protein
|
157
|
+
def type_of_sequences(fasta_format_string)
|
158
|
+
# the first sequence does not need to have a fasta definition line
|
159
|
+
sequences = fasta_format_string.split(/^>.*$/).delete_if(&:empty?)
|
160
|
+
# get all sequence types
|
161
|
+
sequence_types = sequences.collect { |seq| guess_sequence_type(seq) }.uniq.compact
|
162
|
+
|
163
|
+
return nil if sequence_types.empty?
|
164
|
+
return sequence_types.first if sequence_types.length == 1
|
165
|
+
end
|
166
|
+
|
149
167
|
##
|
150
168
|
# Strips all non-letter characters. guestimates sequence based on that.
|
151
169
|
# If less than 10 useable characters... returns nil
|
@@ -165,7 +183,7 @@ module GeneValidator
|
|
165
183
|
|
166
184
|
##
|
167
185
|
#
|
168
|
-
def
|
186
|
+
def guess_sequence_type_from_input_file(file = opt[:input_fasta_file])
|
169
187
|
lines = File.foreach(file).first(10)
|
170
188
|
seqs = ''
|
171
189
|
lines.each do |l|
|
@@ -173,27 +191,6 @@ module GeneValidator
|
|
173
191
|
end
|
174
192
|
guess_sequence_type(seqs)
|
175
193
|
end
|
176
|
-
|
177
|
-
##
|
178
|
-
# Method copied from sequenceserver/sequencehelpers.rb
|
179
|
-
# Splits input at putative fasta definition lines (like ">adsfadsf");
|
180
|
-
# then guesses sequence type for each sequence.
|
181
|
-
# If not enough sequence to determine, returns nil.
|
182
|
-
# If 2 kinds of sequence mixed together, raises ArgumentError
|
183
|
-
# Otherwise, returns :nucleotide or :protein
|
184
|
-
# Params:
|
185
|
-
# +sequence_string+: String to validate
|
186
|
-
# Output:
|
187
|
-
# nil, :nucleotide or :protein
|
188
|
-
def type_of_sequences(fasta_format_string)
|
189
|
-
# the first sequence does not need to have a fasta definition line
|
190
|
-
sequences = fasta_format_string.split(/^>.*$/).delete_if(&:empty?)
|
191
|
-
# get all sequence types
|
192
|
-
sequence_types = sequences.collect { |seq| BlastUtils.guess_sequence_type(seq) }.uniq.compact
|
193
|
-
|
194
|
-
return nil if sequence_types.empty?
|
195
|
-
return sequence_types.first if sequence_types.length == 1
|
196
|
-
end
|
197
194
|
end
|
198
195
|
end
|
199
196
|
end
|