genevalidator 1.6.1 → 1.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/.travis.yml +2 -0
- data/README.md +78 -30
- data/Rakefile +11 -8
- data/aux/app_template_footer.erb +1 -6
- data/aux/app_template_header.erb +12 -32
- data/aux/files/css/style.css +2 -8
- data/aux/files/js/plots.js +564 -576
- data/aux/files/js/script.js +10 -0
- data/aux/json_footer.erb +8 -0
- data/aux/json_header.erb +19 -0
- data/aux/json_query.erb +14 -0
- data/aux/template_footer.erb +9 -58
- data/aux/template_header.erb +18 -58
- data/aux/template_query.erb +8 -36
- data/bin/genevalidator +45 -32
- data/genevalidator.gemspec +11 -7
- data/lib/genevalidator.rb +75 -455
- data/lib/genevalidator/arg_validation.rb +78 -107
- data/lib/genevalidator/blast.rb +57 -60
- data/lib/genevalidator/clusterization.rb +15 -15
- data/lib/genevalidator/exceptions.rb +32 -5
- data/lib/genevalidator/get_raw_sequences.rb +70 -33
- data/lib/genevalidator/hsp.rb +1 -4
- data/lib/genevalidator/json_to_gv_results.rb +109 -0
- data/lib/genevalidator/output.rb +177 -185
- data/lib/genevalidator/pool.rb +2 -1
- data/lib/genevalidator/sequences.rb +3 -3
- data/lib/genevalidator/tabular_parser.rb +24 -18
- data/lib/genevalidator/validation.rb +279 -0
- data/lib/genevalidator/validation_alignment.rb +31 -47
- data/lib/genevalidator/validation_blast_reading_frame.rb +19 -18
- data/lib/genevalidator/validation_duplication.rb +23 -19
- data/lib/genevalidator/validation_gene_merge.rb +30 -65
- data/lib/genevalidator/validation_length_cluster.rb +14 -53
- data/lib/genevalidator/validation_length_rank.rb +10 -11
- data/lib/genevalidator/validation_open_reading_frame.rb +18 -19
- data/lib/genevalidator/validation_report.rb +2 -5
- data/lib/genevalidator/validation_test.rb +8 -4
- data/lib/genevalidator/version.rb +1 -1
- data/test/test_all_validations.rb +51 -66
- data/test/test_blast.rb +68 -51
- data/test/test_clusterization.rb +1 -1
- data/test/test_clusterization_2d.rb +19 -13
- data/test/test_extended_array_methods.rb +1 -1
- data/test/test_files/all_validations_mrna/mrna.blast_tab6 +1806 -0
- data/test/test_files/all_validations_mrna/mrna.blast_tab7 +1865 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml → mrna.blast_xml} +18642 -1
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml.index → mrna.blast_xml.index} +300 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta → mrna.fa} +0 -0
- data/test/test_files/all_validations_mrna/mrna.raw_seq +3970 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml.raw_seq.idx → mrna.raw_seq.idx} +901 -1
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_tab → prot.blast_tab6} +416 -0
- data/test/test_files/all_validations_prot/prot.blast_tab7 +2400 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml → prot.blast_xml} +18299 -6723
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.index → prot.blast_xml.index} +408 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta → prot.fa} +0 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.raw_seq → prot.raw_seq} +2735 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.raw_seq.idx → prot.raw_seq.idx} +3032 -1808
- data/test/test_sequences.rb +46 -41
- data/test/test_validation_open_reading_frame.rb +318 -202
- data/test/test_validations.rb +48 -32
- metadata +76 -102
- data/doc/AliasDuplicationError.html +0 -134
- data/doc/AlignmentValidation.html +0 -1687
- data/doc/AlignmentValidationOutput.html +0 -659
- data/doc/Blast.html +0 -1905
- data/doc/BlastRFValidationOutput.html +0 -545
- data/doc/BlastReadingFrameValidation.html +0 -370
- data/doc/BlastUtils.html +0 -875
- data/doc/ClasspathError.html +0 -134
- data/doc/Cluster.html +0 -1316
- data/doc/DuplciationValidationOutput.html +0 -564
- data/doc/DuplicationValidation.html +0 -920
- data/doc/DuplicationValidationOutput.html +0 -564
- data/doc/FileNotFoundException.html +0 -134
- data/doc/GeneMergeValidation.html +0 -935
- data/doc/GeneMergeValidationOutput.html +0 -652
- data/doc/HierarchicalClusterization.html +0 -994
- data/doc/Hsp.html +0 -1485
- data/doc/InconsistentTabularFormat.html +0 -135
- data/doc/LengthClusterValidation.html +0 -982
- data/doc/LengthClusterValidationOutput.html +0 -515
- data/doc/LengthRankValidation.html +0 -496
- data/doc/LengthRankValidationOutput.html +0 -517
- data/doc/NoInternetError.html +0 -135
- data/doc/NoMafftInstallationError.html +0 -134
- data/doc/NoPIdentError.html +0 -134
- data/doc/NoValidationError.html +0 -134
- data/doc/NotEnoughHitsError.html +0 -135
- data/doc/ORFValidationOutput.html +0 -593
- data/doc/OpenReadingFrameValidation.html +0 -1107
- data/doc/OtherError.html +0 -123
- data/doc/Output.html +0 -1540
- data/doc/Pair.html +0 -309
- data/doc/PairCluster.html +0 -767
- data/doc/Plot.html +0 -837
- data/doc/QueryError.html +0 -134
- data/doc/ReportClassError.html +0 -135
- data/doc/Sequence.html +0 -1299
- data/doc/SequenceTypeError.html +0 -135
- data/doc/TabularEntry.html +0 -837
- data/doc/TabularParser.html +0 -1104
- data/doc/Validation.html +0 -2147
- data/doc/ValidationClassError.html +0 -134
- data/doc/ValidationOutput.html +0 -460
- data/doc/ValidationReport.html +0 -940
- data/doc/ValidationTest.html +0 -939
- data/doc/_index.html +0 -449
- data/doc/class_list.html +0 -54
- data/doc/css/common.css +0 -1
- data/doc/css/full_list.css +0 -57
- data/doc/css/style.css +0 -338
- data/doc/file.README.html +0 -151
- data/doc/file_list.html +0 -56
- data/doc/frames.html +0 -26
- data/doc/index.html +0 -151
- data/doc/js/app.js +0 -214
- data/doc/js/full_list.js +0 -178
- data/doc/js/jquery.js +0 -4
- data/doc/method_list.html +0 -1505
- data/doc/top-level-namespace.html +0 -112
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab +0 -967
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.index +0 -967
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.raw_seq +0 -4929
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.raw_seq.idx +0 -1006
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_xml.raw_seq +0 -2075
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.index +0 -1864
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.raw_seq +0 -42411
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.raw_seq.idx +0 -3751
@@ -1,26 +1,39 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
|
3
|
+
require 'genevalidator/blast'
|
4
|
+
|
1
5
|
# A module to validate the command line Arguments
|
2
6
|
## CREDIT: some of these methods have been adapted from SequenceServer
|
3
7
|
module GeneValidator
|
4
8
|
# TODO: If a tabular file is provided, ensure that a tabular file has the
|
5
9
|
# right number of columns
|
6
10
|
# TODO: assert_if_ruby_version_is_supported
|
7
|
-
# A
|
8
|
-
|
11
|
+
# A class to validate the arguments passed to the Validation Class
|
12
|
+
class GVArgValidation
|
9
13
|
class << self
|
10
|
-
|
14
|
+
extend Forwardable
|
15
|
+
def_delegators GeneValidator, :opt
|
16
|
+
|
17
|
+
def validate_args
|
11
18
|
@opt = opt
|
12
19
|
assert_output_dir_does_not_exist
|
13
20
|
assert_file_present('input file', opt[:input_fasta_file])
|
14
21
|
assert_input_file_probably_fasta
|
15
|
-
|
22
|
+
assert_input_sequence
|
16
23
|
assert_BLAST_output_files
|
17
24
|
|
18
25
|
assert_validations_arg
|
19
26
|
check_num_threads
|
20
27
|
|
28
|
+
export_bin_dirs unless @opt[:bin].nil?
|
29
|
+
|
21
30
|
Blast.validate(opt) unless @opt[:test]
|
22
|
-
|
23
|
-
|
31
|
+
assert_mafft_installation
|
32
|
+
end
|
33
|
+
|
34
|
+
# Return `true` if the given command exists and is executable.
|
35
|
+
def command?(command)
|
36
|
+
system("which #{command} > /dev/null 2>&1")
|
24
37
|
end
|
25
38
|
|
26
39
|
private
|
@@ -37,10 +50,10 @@ module GeneValidator
|
|
37
50
|
def check_num_threads
|
38
51
|
@opt[:num_threads] = Integer(@opt[:num_threads])
|
39
52
|
unless @opt[:num_threads] > 0
|
40
|
-
puts 'Number of threads can not be lower than 0'
|
53
|
+
$stderr.puts 'Number of threads can not be lower than 0'
|
41
54
|
end
|
42
55
|
return unless @opt[:num_threads] > 256
|
43
|
-
puts "Number of threads set at #{@opt[:num_threads]} is unusually high."
|
56
|
+
$stderr.puts "Number of threads set at #{@opt[:num_threads]} is unusually high."
|
44
57
|
end
|
45
58
|
|
46
59
|
def assert_BLAST_output_files
|
@@ -56,18 +69,18 @@ module GeneValidator
|
|
56
69
|
def assert_output_dir_does_not_exist
|
57
70
|
output_dir = "#{@opt[:input_fasta_file]}.html"
|
58
71
|
return unless File.exist?(output_dir)
|
59
|
-
puts "The output directory already exists for this fasta file.\n"
|
60
|
-
puts "Please remove the following directory: #{output_dir}\n"
|
61
|
-
puts "You can run the following command to remove the folder.\n"
|
62
|
-
puts "\n $ rm -r #{output_dir} \n"
|
72
|
+
$stderr.puts "The output directory already exists for this fasta file.\n"
|
73
|
+
$stderr.puts "Please remove the following directory: #{output_dir}\n"
|
74
|
+
$stderr.puts "You can run the following command to remove the folder.\n"
|
75
|
+
$stderr.puts "\n $ rm -r #{output_dir} \n"
|
63
76
|
exit 1
|
64
77
|
end
|
65
78
|
|
66
79
|
def assert_tabular_options_exists
|
67
80
|
return if @opt[:blast_tabular_options]
|
68
|
-
puts '*** Error: BLAST tabular options (-o) have not been set.'
|
69
|
-
puts ' Please set the "-o" option with the custom format'
|
70
|
-
puts ' used in the BLAST -outfmt argument'
|
81
|
+
$stderr.puts '*** Error: BLAST tabular options (-o) have not been set.'
|
82
|
+
$stderr.puts ' Please set the "-o" option with the custom format'
|
83
|
+
$stderr.puts ' used in the BLAST -outfmt argument'
|
71
84
|
exit 1
|
72
85
|
end
|
73
86
|
|
@@ -79,20 +92,45 @@ module GeneValidator
|
|
79
92
|
|
80
93
|
def assert_file_present(desc, file, exit_code = 1)
|
81
94
|
return if file && File.exist?(File.expand_path(file))
|
82
|
-
puts "*** Error: Couldn't find the #{desc}: #{file}."
|
95
|
+
$stderr.puts "*** Error: Couldn't find the #{desc}: #{file}."
|
83
96
|
exit exit_code
|
84
97
|
end
|
85
98
|
|
86
99
|
alias_method :assert_dir_present, :assert_file_present
|
87
100
|
|
88
|
-
def
|
101
|
+
def assert_input_sequence
|
89
102
|
fasta_content = IO.binread(@opt[:input_fasta_file])
|
90
103
|
type = BlastUtils.type_of_sequences(fasta_content)
|
91
104
|
return if type == :nucleotide || type == :protein
|
92
|
-
puts '*** Error: The input files does not contain just protein or'
|
93
|
-
puts ' nucleotide data. Please correct this and try again.'
|
105
|
+
$stderr.puts '*** Error: The input files does not contain just protein or'
|
106
|
+
$stderr.puts ' nucleotide data. Please correct this and try again.'
|
94
107
|
exit 1
|
95
108
|
end
|
109
|
+
|
110
|
+
def export_bin_dirs
|
111
|
+
@opt[:bin].each do |bin|
|
112
|
+
if File.directory?(bin)
|
113
|
+
add_to_path(bin)
|
114
|
+
else
|
115
|
+
$stderr.puts '*** The following bin directory does not exist:'
|
116
|
+
$stderr.puts " #{bin}"
|
117
|
+
end
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
## Checks if dir is in $PATH and if not, it adds the dir to the $PATH.
|
122
|
+
def add_to_path(bin_dir)
|
123
|
+
return if ENV['PATH'].split(':').include?(bin_dir)
|
124
|
+
ENV['PATH'] = "#{bin_dir}:#{ENV['PATH']}"
|
125
|
+
end
|
126
|
+
|
127
|
+
def assert_mafft_installation
|
128
|
+
return if command?('mafft')
|
129
|
+
$stderr.puts '*** Could not find Mafft binaries.'
|
130
|
+
$stderr.puts ' Ignoring error and continuing - Please note that' \
|
131
|
+
' some validations may be skipped.'
|
132
|
+
$stderr.puts # a blank line
|
133
|
+
end
|
96
134
|
end
|
97
135
|
|
98
136
|
# Validates BLAST Installation (And BLAST databases)
|
@@ -106,40 +144,32 @@ module GeneValidator
|
|
106
144
|
EXIT_NO_BLAST_DATABASE = 4
|
107
145
|
|
108
146
|
def validate(opt)
|
109
|
-
@opt = opt
|
110
147
|
assert_blast_installation
|
111
|
-
|
112
|
-
assert_local_blast_database_exists if
|
148
|
+
warn_if_remote_database(opt[:db])
|
149
|
+
assert_local_blast_database_exists(opt[:db]) if opt[:db] !~ /remote/
|
113
150
|
end
|
114
151
|
|
115
152
|
def assert_blast_installation
|
116
153
|
# Validate BLAST installation
|
117
|
-
|
118
|
-
|
119
|
-
assert_blast_compatible
|
120
|
-
else
|
121
|
-
export_bin_dir
|
122
|
-
end
|
154
|
+
assert_blast_installed
|
155
|
+
assert_blast_compatible
|
123
156
|
end
|
124
157
|
|
125
|
-
def
|
126
|
-
return
|
127
|
-
puts
|
128
|
-
puts '
|
129
|
-
puts
|
130
|
-
puts
|
131
|
-
|
132
|
-
puts
|
133
|
-
puts # a blank line
|
134
|
-
puts " $ genevalidator -d 'swissprot -remote' Input_File"
|
135
|
-
exit 1
|
158
|
+
def warn_if_remote_database(db)
|
159
|
+
return if db !~ /remote/
|
160
|
+
$stderr.puts # a blank line
|
161
|
+
$stderr.puts 'Warning: BLAST will be carried out on remote servers.'
|
162
|
+
$stderr.puts 'This may take quite a bit of time.'
|
163
|
+
$stderr.puts 'You may want to install a local BLAST database for' \
|
164
|
+
' faster analyses.'
|
165
|
+
$stderr.puts # a blank line
|
136
166
|
end
|
137
167
|
|
138
|
-
def assert_local_blast_database_exists
|
139
|
-
return if system("blastdbcmd -db #{
|
140
|
-
puts '*** No BLAST database found at the provided path.'
|
141
|
-
puts ' Please ensure that the provided path is correct
|
142
|
-
|
168
|
+
def assert_local_blast_database_exists(db)
|
169
|
+
return if system("blastdbcmd -db #{db} -info > /dev/null 2>&1")
|
170
|
+
$stderr.puts '*** No BLAST database found at the provided path.'
|
171
|
+
$stderr.puts ' Please ensure that the provided path is correct' \
|
172
|
+
' and then try again.'
|
143
173
|
exit EXIT_NO_BLAST_DATABASE
|
144
174
|
end
|
145
175
|
|
@@ -147,77 +177,18 @@ module GeneValidator
|
|
147
177
|
|
148
178
|
def assert_blast_installed
|
149
179
|
return if GVArgValidation.command?('blastdbcmd')
|
150
|
-
puts '*** Could not find BLAST+ binaries.'
|
180
|
+
$stderr.puts '*** Could not find BLAST+ binaries.'
|
151
181
|
exit EXIT_BLAST_NOT_INSTALLED
|
152
182
|
end
|
153
183
|
|
154
184
|
def assert_blast_compatible
|
155
185
|
version = `blastdbcmd -version`.split[1]
|
156
186
|
return if version >= MINIMUM_BLAST_VERSION
|
157
|
-
puts "*** Your BLAST+ version #{version} is outdated."
|
158
|
-
puts ' GeneValidator needs NCBI BLAST+ version' \
|
159
|
-
|
187
|
+
$stderr.puts "*** Your BLAST+ version #{version} is outdated."
|
188
|
+
$stderr.puts ' GeneValidator needs NCBI BLAST+ version' \
|
189
|
+
" #{MINIMUM_BLAST_VERSION} or higher."
|
160
190
|
exit EXIT_BLAST_NOT_COMPATIBLE
|
161
191
|
end
|
162
|
-
|
163
|
-
def export_bin_dir
|
164
|
-
if File.directory?(@opt[:blast_bin])
|
165
|
-
GVArgValidation.add_to_path(@opt[:blast_bin])
|
166
|
-
else
|
167
|
-
puts '*** The provided BLAST bin directory does not exist.'
|
168
|
-
puts ' Please ensure that the provided BLAST bin directory is' \
|
169
|
-
' correct and try again.'
|
170
|
-
exit EXIT_BLAST_NOT_INSTALLED
|
171
|
-
end
|
172
|
-
end
|
173
|
-
end
|
174
|
-
end
|
175
|
-
|
176
|
-
# Validates Mafft installation
|
177
|
-
class Mafft
|
178
|
-
class << self
|
179
|
-
def assert_mafft_installation(opt)
|
180
|
-
@opt = opt
|
181
|
-
if @opt[:mafft_bin].nil?
|
182
|
-
assert_mafft_installed
|
183
|
-
else
|
184
|
-
export_bin_dir
|
185
|
-
end
|
186
|
-
end
|
187
|
-
|
188
|
-
private
|
189
|
-
|
190
|
-
def assert_mafft_installed
|
191
|
-
return if GVArgValidation.command?('mafft')
|
192
|
-
puts '*** Could not find Mafft binaries.'
|
193
|
-
puts ' Ignoring error and continuing - Please note that some' \
|
194
|
-
' validations may be skipped.'
|
195
|
-
puts # a blank line
|
196
|
-
end
|
197
|
-
|
198
|
-
def export_bin_dir
|
199
|
-
if File.directory?(@opt[:mafft_bin])
|
200
|
-
GVArgValidation.add_to_path(@opt[:mafft_bin])
|
201
|
-
else
|
202
|
-
puts '*** The provided Mafft bin directory does not exist.'
|
203
|
-
puts ' Ignoring error and continuing - Please note that some' \
|
204
|
-
' validations may be skipped.'
|
205
|
-
puts # a blank line
|
206
|
-
end
|
207
|
-
end
|
208
|
-
end
|
209
|
-
end
|
210
|
-
|
211
|
-
class << self
|
212
|
-
## Checks if dir is in $PATH and if not, it adds the dir to the $PATH.
|
213
|
-
def add_to_path(bin_dir)
|
214
|
-
return if ENV['PATH'].split(':').include?(bin_dir)
|
215
|
-
ENV['PATH'] = "#{bin_dir}:#{ENV['PATH']}"
|
216
|
-
end
|
217
|
-
|
218
|
-
# Return `true` if the given command exists and is executable.
|
219
|
-
def command?(command)
|
220
|
-
system("which #{command} > /dev/null 2>&1")
|
221
192
|
end
|
222
193
|
end
|
223
194
|
end
|
data/lib/genevalidator/blast.rb
CHANGED
@@ -1,19 +1,19 @@
|
|
1
|
-
require '
|
1
|
+
require 'bio'
|
2
|
+
require 'bio-blastxmlparser'
|
3
|
+
require 'forwardable'
|
4
|
+
|
5
|
+
require 'genevalidator/exceptions'
|
2
6
|
require 'genevalidator/hsp'
|
7
|
+
require 'genevalidator/sequences'
|
3
8
|
require 'genevalidator/output'
|
4
|
-
require 'genevalidator/exceptions'
|
5
|
-
require 'bio-blastxmlparser'
|
6
|
-
require 'net/http'
|
7
|
-
require 'open-uri'
|
8
|
-
require 'uri'
|
9
|
-
require 'io/console'
|
10
|
-
require 'yaml'
|
11
|
-
require 'bio'
|
12
9
|
|
13
10
|
module GeneValidator
|
14
11
|
# Contains methods that run BLAST and methods that analyse sequences
|
15
12
|
class BlastUtils
|
16
13
|
class << self
|
14
|
+
extend Forwardable
|
15
|
+
def_delegators GeneValidator, :opt, :config
|
16
|
+
|
17
17
|
EVALUE = 1e-5
|
18
18
|
|
19
19
|
##
|
@@ -25,7 +25,10 @@ module GeneValidator
|
|
25
25
|
# +num_threads+: The number of threads to run BLAST with.
|
26
26
|
# Output:
|
27
27
|
# String with the blast xml output
|
28
|
-
def run_blast(
|
28
|
+
def run_blast(query, db = opt[:db], seq_type = config[:type],
|
29
|
+
num_threads = opt[:num_threads])
|
30
|
+
|
31
|
+
blast_type = (seq_type == :protein) ? 'blastp' : 'blastx'
|
29
32
|
# -num_threads is not supported on remote databases
|
30
33
|
threads = (db !~ /remote/) ? "-num_threads #{num_threads}" : ''
|
31
34
|
|
@@ -46,20 +49,26 @@ module GeneValidator
|
|
46
49
|
# +nr_hits+: max number of hits
|
47
50
|
# Output:
|
48
51
|
# XML file
|
49
|
-
def
|
50
|
-
|
52
|
+
def run_blast_on_input_file(input_file = opt[:input_fasta_file],
|
53
|
+
db = opt[:db], seq_type = config[:type],
|
54
|
+
num_threads = opt[:num_threads])
|
55
|
+
return if opt[:blast_xml_file] || opt[:blast_tabular_file]
|
56
|
+
|
57
|
+
$stderr.puts 'Running BLAST'
|
58
|
+
opt[:blast_xml_file] = input_file + '.blast_xml'
|
59
|
+
|
51
60
|
blast_type = (seq_type == :protein) ? 'blastp' : 'blastx'
|
52
61
|
# -num_threads is not supported on remote databases
|
53
|
-
threads = (opt[:db] !~ /remote/) ? "-num_threads #{
|
62
|
+
threads = (opt[:db] !~ /remote/) ? "-num_threads #{num_threads}" : ''
|
54
63
|
|
55
|
-
blastcmd = "#{blast_type} -query '#{
|
56
|
-
" -out '#{opt[:blast_xml_file]}' -db #{
|
64
|
+
blastcmd = "#{blast_type} -query '#{input_file}'" \
|
65
|
+
" -out '#{opt[:blast_xml_file]}' -db #{db} " \
|
57
66
|
" -evalue #{EVALUE} -outfmt 5 #{threads}"
|
58
67
|
|
59
68
|
`#{blastcmd}`
|
60
69
|
return unless File.zero?(opt[:blast_xml_file])
|
61
|
-
puts 'Blast failed to run on the input file. Please ensure that the'
|
62
|
-
puts 'BLAST database exists and try again'
|
70
|
+
$stderr.puts 'Blast failed to run on the input file. Please ensure that the'
|
71
|
+
$stderr.puts 'BLAST database exists and try again'
|
63
72
|
exit 1
|
64
73
|
end
|
65
74
|
|
@@ -70,9 +79,7 @@ module GeneValidator
|
|
70
79
|
# +type+: the type of the sequence: :nucleotide or :protein
|
71
80
|
# Outputs:
|
72
81
|
# Array of +Sequence+ objects corresponding to the list of hits
|
73
|
-
def parse_next(iterator, type)
|
74
|
-
fail TypeError unless iterator.is_a? Enumerator
|
75
|
-
|
82
|
+
def parse_next(iterator, type = config[:type])
|
76
83
|
hits = []
|
77
84
|
iter = iterator.next
|
78
85
|
|
@@ -85,7 +92,6 @@ module GeneValidator
|
|
85
92
|
seq.type = :protein
|
86
93
|
seq.identifier = hit.hit_id
|
87
94
|
seq.definition = hit.hit_def
|
88
|
-
# puts seq.identifier
|
89
95
|
seq.accession_no = hit.accession
|
90
96
|
|
91
97
|
# get all high-scoring segment pairs (hsp)
|
@@ -93,7 +99,7 @@ module GeneValidator
|
|
93
99
|
|
94
100
|
hit.hsps.each do |hsp|
|
95
101
|
current_hsp = Hsp.new
|
96
|
-
current_hsp.hsp_evalue = '%.0e'
|
102
|
+
current_hsp.hsp_evalue = format('%.0e', hsp.evalue)
|
97
103
|
|
98
104
|
current_hsp.hit_from = hsp.hit_from.to_i
|
99
105
|
current_hsp.hit_to = hsp.hit_to.to_i
|
@@ -102,20 +108,20 @@ module GeneValidator
|
|
102
108
|
|
103
109
|
if type == :nucleotide
|
104
110
|
current_hsp.match_query_from /= 3
|
105
|
-
current_hsp.match_query_to
|
111
|
+
current_hsp.match_query_to /= 3
|
106
112
|
current_hsp.match_query_from += 1
|
107
|
-
current_hsp.match_query_to
|
113
|
+
current_hsp.match_query_to += 1
|
108
114
|
end
|
109
115
|
|
110
116
|
current_hsp.query_reading_frame = hsp.query_frame.to_i
|
111
117
|
|
112
118
|
current_hsp.hit_alignment = hsp.hseq.to_s
|
113
|
-
if
|
119
|
+
if guess_sequence_type(current_hsp.hit_alignment) != :protein
|
114
120
|
fail SequenceTypeError
|
115
121
|
end
|
116
122
|
|
117
123
|
current_hsp.query_alignment = hsp.qseq.to_s
|
118
|
-
if
|
124
|
+
if guess_sequence_type(current_hsp.query_alignment) != :protein
|
119
125
|
fail SequenceTypeError
|
120
126
|
end
|
121
127
|
current_hsp.align_len = hsp.align_len.to_i
|
@@ -129,23 +135,35 @@ module GeneValidator
|
|
129
135
|
hits.push(seq)
|
130
136
|
end
|
131
137
|
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
line = error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0]
|
136
|
-
$stderr.print "Type error at #{line}. Possible cause: you didn't call" \
|
137
|
-
" parse method first!\n"
|
138
|
-
exit 1
|
139
|
-
rescue SequenceTypeError => error
|
140
|
-
line = error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0]
|
141
|
-
$stderr.print "Sequence Type error at #{line}. Possible cause: the" \
|
142
|
-
'blast output was not obtained against a protein' \
|
143
|
-
" database.\n"
|
138
|
+
hits
|
139
|
+
rescue SequenceTypeError => e
|
140
|
+
$stderr.puts e
|
144
141
|
exit 1
|
145
142
|
rescue StopIteration
|
146
143
|
nil
|
147
144
|
end
|
148
145
|
|
146
|
+
##
|
147
|
+
# Method copied from sequenceserver/sequencehelpers.rb
|
148
|
+
# Splits input at putative fasta definition lines (like ">adsfadsf");
|
149
|
+
# then guesses sequence type for each sequence.
|
150
|
+
# If not enough sequence to determine, returns nil.
|
151
|
+
# If 2 kinds of sequence mixed together, raises ArgumentError
|
152
|
+
# Otherwise, returns :nucleotide or :protein
|
153
|
+
# Params:
|
154
|
+
# +sequence_string+: String to validate
|
155
|
+
# Output:
|
156
|
+
# nil, :nucleotide or :protein
|
157
|
+
def type_of_sequences(fasta_format_string)
|
158
|
+
# the first sequence does not need to have a fasta definition line
|
159
|
+
sequences = fasta_format_string.split(/^>.*$/).delete_if(&:empty?)
|
160
|
+
# get all sequence types
|
161
|
+
sequence_types = sequences.collect { |seq| guess_sequence_type(seq) }.uniq.compact
|
162
|
+
|
163
|
+
return nil if sequence_types.empty?
|
164
|
+
return sequence_types.first if sequence_types.length == 1
|
165
|
+
end
|
166
|
+
|
149
167
|
##
|
150
168
|
# Strips all non-letter characters. guestimates sequence based on that.
|
151
169
|
# If less than 10 useable characters... returns nil
|
@@ -165,7 +183,7 @@ module GeneValidator
|
|
165
183
|
|
166
184
|
##
|
167
185
|
#
|
168
|
-
def
|
186
|
+
def guess_sequence_type_from_input_file(file = opt[:input_fasta_file])
|
169
187
|
lines = File.foreach(file).first(10)
|
170
188
|
seqs = ''
|
171
189
|
lines.each do |l|
|
@@ -173,27 +191,6 @@ module GeneValidator
|
|
173
191
|
end
|
174
192
|
guess_sequence_type(seqs)
|
175
193
|
end
|
176
|
-
|
177
|
-
##
|
178
|
-
# Method copied from sequenceserver/sequencehelpers.rb
|
179
|
-
# Splits input at putative fasta definition lines (like ">adsfadsf");
|
180
|
-
# then guesses sequence type for each sequence.
|
181
|
-
# If not enough sequence to determine, returns nil.
|
182
|
-
# If 2 kinds of sequence mixed together, raises ArgumentError
|
183
|
-
# Otherwise, returns :nucleotide or :protein
|
184
|
-
# Params:
|
185
|
-
# +sequence_string+: String to validate
|
186
|
-
# Output:
|
187
|
-
# nil, :nucleotide or :protein
|
188
|
-
def type_of_sequences(fasta_format_string)
|
189
|
-
# the first sequence does not need to have a fasta definition line
|
190
|
-
sequences = fasta_format_string.split(/^>.*$/).delete_if(&:empty?)
|
191
|
-
# get all sequence types
|
192
|
-
sequence_types = sequences.collect { |seq| BlastUtils.guess_sequence_type(seq) }.uniq.compact
|
193
|
-
|
194
|
-
return nil if sequence_types.empty?
|
195
|
-
return sequence_types.first if sequence_types.length == 1
|
196
|
-
end
|
197
194
|
end
|
198
195
|
end
|
199
196
|
end
|