genevalidator 1.6.1 → 1.6.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/.travis.yml +2 -0
- data/README.md +78 -30
- data/Rakefile +11 -8
- data/aux/app_template_footer.erb +1 -6
- data/aux/app_template_header.erb +12 -32
- data/aux/files/css/style.css +2 -8
- data/aux/files/js/plots.js +564 -576
- data/aux/files/js/script.js +10 -0
- data/aux/json_footer.erb +8 -0
- data/aux/json_header.erb +19 -0
- data/aux/json_query.erb +14 -0
- data/aux/template_footer.erb +9 -58
- data/aux/template_header.erb +18 -58
- data/aux/template_query.erb +8 -36
- data/bin/genevalidator +45 -32
- data/genevalidator.gemspec +11 -7
- data/lib/genevalidator.rb +75 -455
- data/lib/genevalidator/arg_validation.rb +78 -107
- data/lib/genevalidator/blast.rb +57 -60
- data/lib/genevalidator/clusterization.rb +15 -15
- data/lib/genevalidator/exceptions.rb +32 -5
- data/lib/genevalidator/get_raw_sequences.rb +70 -33
- data/lib/genevalidator/hsp.rb +1 -4
- data/lib/genevalidator/json_to_gv_results.rb +109 -0
- data/lib/genevalidator/output.rb +177 -185
- data/lib/genevalidator/pool.rb +2 -1
- data/lib/genevalidator/sequences.rb +3 -3
- data/lib/genevalidator/tabular_parser.rb +24 -18
- data/lib/genevalidator/validation.rb +279 -0
- data/lib/genevalidator/validation_alignment.rb +31 -47
- data/lib/genevalidator/validation_blast_reading_frame.rb +19 -18
- data/lib/genevalidator/validation_duplication.rb +23 -19
- data/lib/genevalidator/validation_gene_merge.rb +30 -65
- data/lib/genevalidator/validation_length_cluster.rb +14 -53
- data/lib/genevalidator/validation_length_rank.rb +10 -11
- data/lib/genevalidator/validation_open_reading_frame.rb +18 -19
- data/lib/genevalidator/validation_report.rb +2 -5
- data/lib/genevalidator/validation_test.rb +8 -4
- data/lib/genevalidator/version.rb +1 -1
- data/test/test_all_validations.rb +51 -66
- data/test/test_blast.rb +68 -51
- data/test/test_clusterization.rb +1 -1
- data/test/test_clusterization_2d.rb +19 -13
- data/test/test_extended_array_methods.rb +1 -1
- data/test/test_files/all_validations_mrna/mrna.blast_tab6 +1806 -0
- data/test/test_files/all_validations_mrna/mrna.blast_tab7 +1865 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml → mrna.blast_xml} +18642 -1
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml.index → mrna.blast_xml.index} +300 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta → mrna.fa} +0 -0
- data/test/test_files/all_validations_mrna/mrna.raw_seq +3970 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml.raw_seq.idx → mrna.raw_seq.idx} +901 -1
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_tab → prot.blast_tab6} +416 -0
- data/test/test_files/all_validations_prot/prot.blast_tab7 +2400 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml → prot.blast_xml} +18299 -6723
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.index → prot.blast_xml.index} +408 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta → prot.fa} +0 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.raw_seq → prot.raw_seq} +2735 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.raw_seq.idx → prot.raw_seq.idx} +3032 -1808
- data/test/test_sequences.rb +46 -41
- data/test/test_validation_open_reading_frame.rb +318 -202
- data/test/test_validations.rb +48 -32
- metadata +76 -102
- data/doc/AliasDuplicationError.html +0 -134
- data/doc/AlignmentValidation.html +0 -1687
- data/doc/AlignmentValidationOutput.html +0 -659
- data/doc/Blast.html +0 -1905
- data/doc/BlastRFValidationOutput.html +0 -545
- data/doc/BlastReadingFrameValidation.html +0 -370
- data/doc/BlastUtils.html +0 -875
- data/doc/ClasspathError.html +0 -134
- data/doc/Cluster.html +0 -1316
- data/doc/DuplciationValidationOutput.html +0 -564
- data/doc/DuplicationValidation.html +0 -920
- data/doc/DuplicationValidationOutput.html +0 -564
- data/doc/FileNotFoundException.html +0 -134
- data/doc/GeneMergeValidation.html +0 -935
- data/doc/GeneMergeValidationOutput.html +0 -652
- data/doc/HierarchicalClusterization.html +0 -994
- data/doc/Hsp.html +0 -1485
- data/doc/InconsistentTabularFormat.html +0 -135
- data/doc/LengthClusterValidation.html +0 -982
- data/doc/LengthClusterValidationOutput.html +0 -515
- data/doc/LengthRankValidation.html +0 -496
- data/doc/LengthRankValidationOutput.html +0 -517
- data/doc/NoInternetError.html +0 -135
- data/doc/NoMafftInstallationError.html +0 -134
- data/doc/NoPIdentError.html +0 -134
- data/doc/NoValidationError.html +0 -134
- data/doc/NotEnoughHitsError.html +0 -135
- data/doc/ORFValidationOutput.html +0 -593
- data/doc/OpenReadingFrameValidation.html +0 -1107
- data/doc/OtherError.html +0 -123
- data/doc/Output.html +0 -1540
- data/doc/Pair.html +0 -309
- data/doc/PairCluster.html +0 -767
- data/doc/Plot.html +0 -837
- data/doc/QueryError.html +0 -134
- data/doc/ReportClassError.html +0 -135
- data/doc/Sequence.html +0 -1299
- data/doc/SequenceTypeError.html +0 -135
- data/doc/TabularEntry.html +0 -837
- data/doc/TabularParser.html +0 -1104
- data/doc/Validation.html +0 -2147
- data/doc/ValidationClassError.html +0 -134
- data/doc/ValidationOutput.html +0 -460
- data/doc/ValidationReport.html +0 -940
- data/doc/ValidationTest.html +0 -939
- data/doc/_index.html +0 -449
- data/doc/class_list.html +0 -54
- data/doc/css/common.css +0 -1
- data/doc/css/full_list.css +0 -57
- data/doc/css/style.css +0 -338
- data/doc/file.README.html +0 -151
- data/doc/file_list.html +0 -56
- data/doc/frames.html +0 -26
- data/doc/index.html +0 -151
- data/doc/js/app.js +0 -214
- data/doc/js/full_list.js +0 -178
- data/doc/js/jquery.js +0 -4
- data/doc/method_list.html +0 -1505
- data/doc/top-level-namespace.html +0 -112
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab +0 -967
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.index +0 -967
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.raw_seq +0 -4929
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.raw_seq.idx +0 -1006
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_xml.raw_seq +0 -2075
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.index +0 -1864
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.raw_seq +0 -42411
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.raw_seq.idx +0 -3751
data/lib/genevalidator.rb
CHANGED
@@ -1,159 +1,94 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
3
|
+
require 'bio-blastxmlparser'
|
4
|
+
|
1
5
|
require 'genevalidator/arg_validation'
|
2
|
-
require 'genevalidator/get_raw_sequences'
|
3
|
-
require 'genevalidator/tabular_parser'
|
4
6
|
require 'genevalidator/blast'
|
5
|
-
require 'genevalidator/output'
|
6
7
|
require 'genevalidator/exceptions'
|
7
|
-
require 'genevalidator/
|
8
|
-
require 'genevalidator/
|
9
|
-
require 'genevalidator/
|
10
|
-
require 'genevalidator/
|
11
|
-
require 'genevalidator/validation_duplication'
|
12
|
-
require 'genevalidator/validation_open_reading_frame'
|
13
|
-
require 'genevalidator/validation_alignment'
|
14
|
-
require 'genevalidator/pool'
|
15
|
-
require 'bio-blastxmlparser'
|
16
|
-
require 'open-uri'
|
17
|
-
require 'uri'
|
18
|
-
require 'io/console'
|
19
|
-
require 'yaml'
|
20
|
-
require 'thread'
|
8
|
+
require 'genevalidator/get_raw_sequences'
|
9
|
+
require 'genevalidator/output'
|
10
|
+
require 'genevalidator/tabular_parser'
|
11
|
+
require 'genevalidator/validation'
|
21
12
|
|
22
13
|
# Top level module / namespace.
|
23
14
|
module GeneValidator
|
24
|
-
|
25
|
-
|
26
|
-
# Main Class that initalises and then runs validations.
|
27
|
-
class Validation
|
28
|
-
attr_reader :opt
|
29
|
-
attr_reader :type
|
30
|
-
attr_reader :input_fasta_file
|
31
|
-
attr_reader :html_path
|
32
|
-
attr_reader :yaml_path
|
33
|
-
attr_reader :filename
|
15
|
+
class << self
|
16
|
+
attr_accessor :opt, :config, :overview
|
34
17
|
attr_reader :raw_seq_file_index
|
35
18
|
attr_reader :raw_seq_file_load
|
36
|
-
attr_accessor :idx # current number of the querry processed
|
37
|
-
attr_reader :start_idx
|
38
19
|
# array of indexes for the start offsets of each query in the fasta file
|
39
|
-
attr_reader :
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
@
|
78
|
-
|
79
|
-
|
80
|
-
@
|
81
|
-
|
82
|
-
|
83
|
-
@
|
84
|
-
@mutex_array = Mutex.new
|
85
|
-
|
86
|
-
# global variables
|
87
|
-
@no_queries = 0
|
88
|
-
@scores = []
|
89
|
-
@good_predictions = 0
|
90
|
-
@bad_predictions = 0
|
91
|
-
@nee = 0
|
92
|
-
@no_mafft = 0
|
93
|
-
@no_internet = 0
|
94
|
-
@map_errors = Hash.new(0)
|
95
|
-
@map_running_times = Hash.new(Pair1.new(0, 0))
|
96
|
-
|
97
|
-
@type = determine_sequence_type
|
98
|
-
@query_offset_lst = index_the_input
|
99
|
-
|
100
|
-
# build the path of html folder output
|
101
|
-
dir = File.dirname(@opt[:input_fasta_file])
|
102
|
-
@filename = File.basename(@opt[:input_fasta_file])
|
103
|
-
@yaml_path = dir
|
104
|
-
@html_path = "#{opt[:input_fasta_file]}.html"
|
105
|
-
@plot_dir = "#{@html_path}/files/json"
|
106
|
-
|
107
|
-
# create 'html' directory
|
108
|
-
Dir.mkdir(@html_path)
|
109
|
-
# copy auxiliar folders to the html folder
|
110
|
-
aux = File.join(File.dirname(File.expand_path(__FILE__)), '../aux/files')
|
111
|
-
FileUtils.cp_r(aux, @html_path)
|
20
|
+
attr_reader :query_idx
|
21
|
+
attr_accessor :mutex, :mutex_html, :mutex_json, :mutex_array
|
22
|
+
|
23
|
+
def init(opt, start_idx = 1, summary = true)
|
24
|
+
$stderr.puts 'Analysing input arguments'
|
25
|
+
@opt = opt
|
26
|
+
GVArgValidation.validate_args # validates @opt
|
27
|
+
|
28
|
+
@config = {
|
29
|
+
idx: 0,
|
30
|
+
start_idx: start_idx,
|
31
|
+
summary: summary,
|
32
|
+
|
33
|
+
type: BlastUtils.guess_sequence_type_from_input_file,
|
34
|
+
filename: File.basename(@opt[:input_fasta_file]),
|
35
|
+
html_path: "#{@opt[:input_fasta_file]}.html",
|
36
|
+
json_file: File.join(File.dirname(@opt[:input_fasta_file]),
|
37
|
+
"#{File.basename(@opt[:input_fasta_file])}.json"),
|
38
|
+
plot_dir: "#{@opt[:input_fasta_file]}.html/files/json",
|
39
|
+
aux: File.expand_path(File.join(File.dirname(__FILE__), '../aux')),
|
40
|
+
|
41
|
+
json_output: [],
|
42
|
+
run_no: 0,
|
43
|
+
output_max: 2500 # max no. of queries in the output file
|
44
|
+
}
|
45
|
+
|
46
|
+
@overview = {
|
47
|
+
no_queries: 0,
|
48
|
+
scores: [],
|
49
|
+
good_scores: 0,
|
50
|
+
bad_scores: 0,
|
51
|
+
nee: 0,
|
52
|
+
no_mafft: 0,
|
53
|
+
no_internet: 0,
|
54
|
+
map_errors: Hash.new(0),
|
55
|
+
run_time: Hash.new(Pair1.new(0, 0))
|
56
|
+
}
|
57
|
+
|
58
|
+
@mutex = Mutex.new
|
59
|
+
@mutex_array = Mutex.new
|
60
|
+
@mutex_html = Mutex.new
|
61
|
+
@mutex_json = Mutex.new
|
62
|
+
create_output_folder
|
63
|
+
index_the_input
|
64
|
+
RawSequences.index_raw_seq_file if @opt[:raw_sequences]
|
112
65
|
end
|
113
66
|
|
114
67
|
##
|
115
68
|
# Parse the blast output and run validations
|
116
69
|
def run
|
117
|
-
# Run BLAST on all sequences
|
118
|
-
|
119
|
-
|
70
|
+
# Run BLAST on all sequences (generates @opt[:blast_xml_file])
|
71
|
+
# if no BLAST OUTPUT file provided...
|
120
72
|
unless @opt[:blast_xml_file] || @opt[:blast_tabular_file]
|
121
|
-
|
122
|
-
run_blast_on_each_sequence
|
123
|
-
else
|
124
|
-
# Extract raw sequences of hits
|
125
|
-
extract_raw_sequences_of_blast_hits unless @opt[:raw_sequences]
|
126
|
-
create_an_index_file_of_raw_seq_file(@opt[:raw_sequences])
|
127
|
-
# Run Validations
|
128
|
-
iterator = parse_blast_output_file
|
129
|
-
run_validations(iterator)
|
73
|
+
BlastUtils.run_blast_on_input_file
|
130
74
|
end
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
BlastUtils.guess_sequence_type_from_file(@opt[:input_fasta_file])
|
75
|
+
# Obtain fasta file of all BLAST hits
|
76
|
+
RawSequences.run unless @opt[:raw_sequences]
|
77
|
+
# Run Validations
|
78
|
+
iterator = parse_blast_output_file
|
79
|
+
(Validations.new).run_validations(iterator)
|
80
|
+
|
81
|
+
Output.write_json_file(@config[:json_output], @config[:json_file])
|
82
|
+
Output.print_footer(@overview, @config)
|
140
83
|
end
|
141
84
|
|
142
85
|
##
|
143
|
-
#
|
144
|
-
def
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
end
|
150
|
-
|
151
|
-
##
|
152
|
-
# Extracts raw sequences of all blast hits
|
153
|
-
def extract_raw_sequences_of_blast_hits
|
154
|
-
puts 'Extracting sequences within the BLAST output file from the BLAST' \
|
155
|
-
' database'
|
156
|
-
@opt[:raw_sequences] = GetRawSequences.run(@opt)
|
86
|
+
# Creates the output folder and copies the auxiliar folders to this folder
|
87
|
+
def create_output_folder(output_dir = @config[:html_path],
|
88
|
+
aux_dir = @config[:aux])
|
89
|
+
Dir.mkdir(output_dir)
|
90
|
+
aux_files = File.join(aux_dir, 'files/')
|
91
|
+
FileUtils.cp_r(aux_files, output_dir)
|
157
92
|
end
|
158
93
|
|
159
94
|
##
|
@@ -162,39 +97,8 @@ module GeneValidator
|
|
162
97
|
# start and end positions of each query.
|
163
98
|
def index_the_input
|
164
99
|
fasta_content = IO.binread(@opt[:input_fasta_file])
|
165
|
-
|
166
|
-
|
167
|
-
fasta_content = nil
|
168
|
-
offset_array
|
169
|
-
end
|
170
|
-
|
171
|
-
##
|
172
|
-
# Index the raw sequences file...
|
173
|
-
def create_an_index_file_of_raw_seq_file(raw_sequence_file)
|
174
|
-
# leave only the identifiers in the fasta description
|
175
|
-
content = File.open(raw_sequence_file, 'rb').read.gsub(/ .*/, '')
|
176
|
-
File.open(raw_sequence_file, 'w+') { |f| f.write(content) }
|
177
|
-
|
178
|
-
# index the fasta file
|
179
|
-
keys = content.scan(/>(.*)\n/).flatten
|
180
|
-
values = content.enum_for(:scan, /(>[^>]+)/).map { Regexp.last_match.begin(0) }
|
181
|
-
|
182
|
-
# make an index hash
|
183
|
-
index_hash = {}
|
184
|
-
keys.each_with_index do |k, i|
|
185
|
-
start = values[i]
|
186
|
-
endf = (i == values.length - 1) ? content.length - 1 : values[i + 1]
|
187
|
-
index_hash[k] = [start, endf]
|
188
|
-
end
|
189
|
-
|
190
|
-
# create FASTA index
|
191
|
-
@raw_seq_file_index = "#{raw_sequence_file}.idx"
|
192
|
-
@raw_seq_file_load = index_hash
|
193
|
-
|
194
|
-
File.open(@raw_seq_file_index, 'w') do |f|
|
195
|
-
YAML.dump(index_hash, f)
|
196
|
-
end
|
197
|
-
content = nil
|
100
|
+
@query_idx = fasta_content.enum_for(:scan, /(>[^>]+)/).map { Regexp.last_match.begin(0) }
|
101
|
+
@query_idx.push(fasta_content.length)
|
198
102
|
end
|
199
103
|
|
200
104
|
##
|
@@ -206,293 +110,9 @@ module GeneValidator
|
|
206
110
|
if @opt[:blast_xml_file]
|
207
111
|
Bio::BlastXMLParser::XmlIterator.new(@opt[:blast_xml_file]).to_enum
|
208
112
|
else
|
209
|
-
TabularParser.new
|
210
|
-
@opt[:blast_tabular_options], @type)
|
113
|
+
TabularParser.new
|
211
114
|
end
|
212
115
|
## TODO: Add a Rescue statement - e.g. if unable to create the Object...
|
213
116
|
end
|
214
|
-
|
215
|
-
##
|
216
|
-
#
|
217
|
-
def run_blast_on_each_sequence
|
218
|
-
# file seek for each query
|
219
|
-
@query_offset_lst[0..@query_offset_lst.length - 2].each_with_index do |_pos, i|
|
220
|
-
if (i + 1) >= @start_idx
|
221
|
-
start_offset = @query_offset_lst[i + 1] - @query_offset_lst[i]
|
222
|
-
end_offset = @query_offset_lst[i]
|
223
|
-
query = IO.binread(@opt[:input_fasta_file], start_offset, end_offset)
|
224
|
-
|
225
|
-
# call blast with the default parameters
|
226
|
-
blast_type = (type == :protein) ? 'blastp' : 'blastx'
|
227
|
-
blast_xml_output = BlastUtils.run_blast(blast_type, query, @opt[:db],
|
228
|
-
@opt[:num_threads])
|
229
|
-
iterator = Bio::BlastXMLParser::NokogiriBlastXml.new(blast_xml_output).to_enum
|
230
|
-
run_validations(iterator)
|
231
|
-
else
|
232
|
-
@idx += 1
|
233
|
-
end
|
234
|
-
end
|
235
|
-
end
|
236
|
-
|
237
|
-
##
|
238
|
-
#
|
239
|
-
def run_validations(iterator)
|
240
|
-
p = Pool.new(@opt[:num_threads]) if @opt[:num_threads] > 1
|
241
|
-
|
242
|
-
while @idx + 1 < @query_offset_lst.length
|
243
|
-
prediction = get_info_on_each_query_sequence
|
244
|
-
@idx += 1
|
245
|
-
|
246
|
-
hits = parse_next_iteration(iterator, prediction)
|
247
|
-
|
248
|
-
if hits.nil?
|
249
|
-
@idx -= 1
|
250
|
-
break
|
251
|
-
end
|
252
|
-
current_idx = @idx
|
253
|
-
# the first validation should be treated separately
|
254
|
-
if current_idx == @start_idx || @opt[:num_threads] == 1
|
255
|
-
validate(prediction, hits, current_idx)
|
256
|
-
else
|
257
|
-
p.schedule(prediction, hits, current_idx) do |prediction, hits, current_idx|
|
258
|
-
validate(prediction, hits, current_idx)
|
259
|
-
end
|
260
|
-
end
|
261
|
-
end
|
262
|
-
ensure
|
263
|
-
p.shutdown if @opt[:num_threads] > 1
|
264
|
-
end
|
265
|
-
|
266
|
-
def parse_next_iteration(iterator, prediction)
|
267
|
-
iterator.next if @idx < @start_idx
|
268
|
-
if @opt[:blast_xml_file]
|
269
|
-
BlastUtils.parse_next(iterator, @type)
|
270
|
-
elsif @opt[:blast_tabular_file]
|
271
|
-
iterator.parse_next(prediction.identifier)
|
272
|
-
end
|
273
|
-
end
|
274
|
-
|
275
|
-
##
|
276
|
-
# get info about the query
|
277
|
-
def get_info_on_each_query_sequence
|
278
|
-
prediction = Sequence.new
|
279
|
-
start_offset = @query_offset_lst[idx + 1] - @query_offset_lst[idx]
|
280
|
-
end_offset = @query_offset_lst[idx]
|
281
|
-
query = IO.binread(@opt[:input_fasta_file], start_offset, end_offset)
|
282
|
-
parse_query = query.scan(/>([^\n]*)\n([A-Za-z\n]*)/)[0]
|
283
|
-
|
284
|
-
prediction.definition = parse_query[0].gsub("\n", '')
|
285
|
-
prediction.identifier = prediction.definition.gsub(/ .*/, '')
|
286
|
-
prediction.type = @type
|
287
|
-
prediction.raw_sequence = parse_query[1].gsub("\n", '')
|
288
|
-
prediction.length_protein = prediction.raw_sequence.length
|
289
|
-
prediction.length_protein /= 3 if @type == :nucleotide
|
290
|
-
prediction
|
291
|
-
end
|
292
|
-
|
293
|
-
##
|
294
|
-
# Validate one query and create validation report
|
295
|
-
# Params:
|
296
|
-
# +prediction+: Sequence object
|
297
|
-
# +hits+: Array of +Sequence+ objects
|
298
|
-
# +idx+: the index number of the query
|
299
|
-
def validate(prediction, hits, current_idx)
|
300
|
-
query_output = do_validations(prediction, hits, current_idx)
|
301
|
-
query_output.generate_html
|
302
|
-
query_output.print_output_file_yaml
|
303
|
-
query_output.print_output_console
|
304
|
-
|
305
|
-
validations = query_output.validations
|
306
|
-
|
307
|
-
no_mafft = 0
|
308
|
-
no_internet = 0
|
309
|
-
errors = []
|
310
|
-
validations.each do |v|
|
311
|
-
unless v.errors.nil?
|
312
|
-
no_mafft += v.errors.select { |e| e == NoMafftInstallationError }.length
|
313
|
-
no_internet += v.errors.select { |e| e == NoInternetError }.length
|
314
|
-
end
|
315
|
-
errors.push(v.short_header) if v.validation == :error
|
316
|
-
end
|
317
|
-
|
318
|
-
no_evidence = validations.count { |v| v.result == :unapplicable || v.result == :warning } == validations.length
|
319
|
-
nee = (no_evidence) ? 1 : 0
|
320
|
-
|
321
|
-
good_predictions = (query_output.overall_score >= 75) ? 1 : 0
|
322
|
-
bad_predictions = (query_output.overall_score >= 75) ? 0 : 1
|
323
|
-
|
324
|
-
@mutex_array.synchronize do
|
325
|
-
@no_queries += 1
|
326
|
-
@scores.push(query_output.overall_score)
|
327
|
-
@good_predictions += good_predictions
|
328
|
-
@bad_predictions += bad_predictions
|
329
|
-
@nee += nee
|
330
|
-
@no_mafft += no_mafft
|
331
|
-
@no_internet += no_internet
|
332
|
-
errors.each { |err| @map_errors[err] += 1 }
|
333
|
-
|
334
|
-
validations.each do |v|
|
335
|
-
next if v.running_time == 0 || v.running_time.nil?
|
336
|
-
next if v.validation == :unapplicable || v.validation == :error
|
337
|
-
p = Pair1.new(@map_running_times[v.short_header].x + v.running_time, @map_running_times[v.short_header].y + 1)
|
338
|
-
@map_running_times[v.short_header] = p
|
339
|
-
end
|
340
|
-
end
|
341
|
-
query_output
|
342
|
-
end
|
343
|
-
|
344
|
-
##
|
345
|
-
# Removes identical hits
|
346
|
-
# Params:
|
347
|
-
# +prediction+: Sequence object
|
348
|
-
# +hits+: Array of +Sequence+ objects
|
349
|
-
# Output:
|
350
|
-
# new array of hit +Sequence+ objects
|
351
|
-
def remove_identical_hits(prediction, hits)
|
352
|
-
# remove the identical hits
|
353
|
-
# identical hit means 100%coverage and >99% identity
|
354
|
-
identical_hits = []
|
355
|
-
hits.each do |hit|
|
356
|
-
# check if all hsps have identity more than 99%
|
357
|
-
low_identity = hit.hsp_list.select { |hsp| hsp.pidentity.nil? || hsp.pidentity < 99 }
|
358
|
-
|
359
|
-
# check the coverage
|
360
|
-
coverage = Array.new(prediction.length_protein, 0)
|
361
|
-
hit.hsp_list.each do |hsp|
|
362
|
-
len = hsp.match_query_to - hsp.match_query_from + 1
|
363
|
-
coverage[hsp.match_query_from - 1..hsp.match_query_to - 1] = Array.new(len, 1)
|
364
|
-
end
|
365
|
-
|
366
|
-
if low_identity.length == 0 && coverage.uniq.length == 1
|
367
|
-
identical_hits.push(hit)
|
368
|
-
end
|
369
|
-
end
|
370
|
-
|
371
|
-
identical_hits.each { |hit| hits.delete(hit) }
|
372
|
-
hits
|
373
|
-
end
|
374
|
-
|
375
|
-
##
|
376
|
-
# Runs all the validations and prints the outputs given the current
|
377
|
-
# prediction query and the corresponding hits
|
378
|
-
# Params:
|
379
|
-
# +prediction+: Sequence object
|
380
|
-
# +hits+: Array of +Sequence+ objects
|
381
|
-
# +idx+: the index number of the query
|
382
|
-
# Output:
|
383
|
-
# +Output+ object
|
384
|
-
def do_validations(prediction, hits, current_idx)
|
385
|
-
begin
|
386
|
-
hits = remove_identical_hits(prediction, hits)
|
387
|
-
rescue Exception => error # NoPIdentError
|
388
|
-
end
|
389
|
-
|
390
|
-
query_output = Output.new(@mutex, @mutex_yaml, @mutex_html,
|
391
|
-
@filename, @html_path,
|
392
|
-
@yaml_path, current_idx, @start_idx)
|
393
|
-
query_output.prediction_len = prediction.length_protein
|
394
|
-
query_output.prediction_def = prediction.definition
|
395
|
-
query_output.nr_hits = hits.length
|
396
|
-
|
397
|
-
plot_path = File.join(@plot_dir, "#{@filename}_#{current_idx}")
|
398
|
-
|
399
|
-
validations = []
|
400
|
-
validations.push LengthClusterValidation.new(@type, prediction, hits,
|
401
|
-
plot_path)
|
402
|
-
validations.push LengthRankValidation.new(@type, prediction, hits)
|
403
|
-
validations.push GeneMergeValidation.new(@type, prediction, hits,
|
404
|
-
plot_path)
|
405
|
-
validations.push DuplicationValidation.new(@type, prediction, hits,
|
406
|
-
@opt[:raw_sequences],
|
407
|
-
@raw_seq_file_index,
|
408
|
-
@raw_seq_file_load, @opt[:db],
|
409
|
-
@opt[:num_threads])
|
410
|
-
validations.push BlastReadingFrameValidation.new(@type, prediction, hits)
|
411
|
-
validations.push OpenReadingFrameValidation.new(@type, prediction, hits,
|
412
|
-
plot_path)
|
413
|
-
validations.push AlignmentValidation.new(@type, prediction, hits,
|
414
|
-
plot_path, @opt[:raw_sequences],
|
415
|
-
@raw_seq_file_index,
|
416
|
-
@raw_seq_file_load,
|
417
|
-
@opt[:db], @opt[:num_threads])
|
418
|
-
|
419
|
-
validations = validations.select { |v| @opt[:validations].include? v.cli_name.downcase }
|
420
|
-
|
421
|
-
# check the class type of the elements in the list
|
422
|
-
validations.each do |v|
|
423
|
-
fail ValidationClassError unless v.is_a? ValidationTest
|
424
|
-
end
|
425
|
-
|
426
|
-
# check alias duplication
|
427
|
-
aliases = validations.map(&:cli_name)
|
428
|
-
fail AliasDuplicationError unless aliases.length == aliases.uniq.length
|
429
|
-
|
430
|
-
validations.each do |v|
|
431
|
-
v.run
|
432
|
-
fail ReportClassError unless v.validation_report.is_a? ValidationReport
|
433
|
-
end
|
434
|
-
query_output.validations = validations.map(&:validation_report)
|
435
|
-
|
436
|
-
fail NoValidationError if query_output.validations.length == 0
|
437
|
-
|
438
|
-
# compute validation score
|
439
|
-
compute_scores(query_output)
|
440
|
-
query_output
|
441
|
-
|
442
|
-
rescue ValidationClassError => error
|
443
|
-
error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
|
444
|
-
$stderr.print "Class Type error at #{error_line}." \
|
445
|
-
' Possible cause: type of one of the validations is not' \
|
446
|
-
" ValidationTest\n"
|
447
|
-
exit 1
|
448
|
-
rescue NoValidationError => error
|
449
|
-
error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
|
450
|
-
$stderr.print "Validation error at #{error_line}." \
|
451
|
-
" Possible cause: your -v arguments are not valid aliases\n"
|
452
|
-
exit 1
|
453
|
-
rescue ReportClassError => error
|
454
|
-
error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
|
455
|
-
$stderr.print "Class Type error at #{error_line}."\
|
456
|
-
' Possible cause: type of one of the validation reports' \
|
457
|
-
" returned by the 'run' method is not ValidationReport\n"
|
458
|
-
exit 1
|
459
|
-
rescue AliasDuplicationError => error
|
460
|
-
error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
|
461
|
-
$stderr.print "Alias Duplication error at #{error_line}."\
|
462
|
-
' Possible cause: At least two validations have the same' \
|
463
|
-
" CLI alias\n"
|
464
|
-
exit 1
|
465
|
-
end
|
466
|
-
|
467
|
-
def compute_scores(query_output)
|
468
|
-
validations = query_output.validations
|
469
|
-
successes = validations.map { |v| v.result == v.expected }.count(true)
|
470
|
-
|
471
|
-
fails = validations.map { |v| v.validation != :unapplicable &&
|
472
|
-
v.validation != :error &&
|
473
|
-
v.result != v.expected }.count(true)
|
474
|
-
|
475
|
-
lcv = validations.select { |v| v.class == LengthClusterValidationOutput }
|
476
|
-
lrv = validations.select { |v| v.class == LengthRankValidationOutput }
|
477
|
-
if lcv.length == 1 && lrv.length == 1
|
478
|
-
score_lcv = (lcv[0].result == lcv[0].expected)
|
479
|
-
score_lrv = (lrv[0].result == lrv[0].expected)
|
480
|
-
# if both are true this should be counted as a single success
|
481
|
-
if score_lcv == true && score_lrv == true
|
482
|
-
successes -= 1
|
483
|
-
elsif score_lcv == false && score_lrv == false
|
484
|
-
# if both are false this will be a fail
|
485
|
-
fails -= 1
|
486
|
-
else
|
487
|
-
successes -= 0.5
|
488
|
-
fails -= 0.5
|
489
|
-
end
|
490
|
-
end
|
491
|
-
|
492
|
-
query_output.successes = successes
|
493
|
-
query_output.fails = fails
|
494
|
-
total_query = successes.to_i + fails
|
495
|
-
query_output.overall_score = (successes * 100 / (total_query)).round(0)
|
496
|
-
end
|
497
117
|
end
|
498
118
|
end
|