genevalidator 1.6.1 → 1.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/.travis.yml +2 -0
- data/README.md +78 -30
- data/Rakefile +11 -8
- data/aux/app_template_footer.erb +1 -6
- data/aux/app_template_header.erb +12 -32
- data/aux/files/css/style.css +2 -8
- data/aux/files/js/plots.js +564 -576
- data/aux/files/js/script.js +10 -0
- data/aux/json_footer.erb +8 -0
- data/aux/json_header.erb +19 -0
- data/aux/json_query.erb +14 -0
- data/aux/template_footer.erb +9 -58
- data/aux/template_header.erb +18 -58
- data/aux/template_query.erb +8 -36
- data/bin/genevalidator +45 -32
- data/genevalidator.gemspec +11 -7
- data/lib/genevalidator.rb +75 -455
- data/lib/genevalidator/arg_validation.rb +78 -107
- data/lib/genevalidator/blast.rb +57 -60
- data/lib/genevalidator/clusterization.rb +15 -15
- data/lib/genevalidator/exceptions.rb +32 -5
- data/lib/genevalidator/get_raw_sequences.rb +70 -33
- data/lib/genevalidator/hsp.rb +1 -4
- data/lib/genevalidator/json_to_gv_results.rb +109 -0
- data/lib/genevalidator/output.rb +177 -185
- data/lib/genevalidator/pool.rb +2 -1
- data/lib/genevalidator/sequences.rb +3 -3
- data/lib/genevalidator/tabular_parser.rb +24 -18
- data/lib/genevalidator/validation.rb +279 -0
- data/lib/genevalidator/validation_alignment.rb +31 -47
- data/lib/genevalidator/validation_blast_reading_frame.rb +19 -18
- data/lib/genevalidator/validation_duplication.rb +23 -19
- data/lib/genevalidator/validation_gene_merge.rb +30 -65
- data/lib/genevalidator/validation_length_cluster.rb +14 -53
- data/lib/genevalidator/validation_length_rank.rb +10 -11
- data/lib/genevalidator/validation_open_reading_frame.rb +18 -19
- data/lib/genevalidator/validation_report.rb +2 -5
- data/lib/genevalidator/validation_test.rb +8 -4
- data/lib/genevalidator/version.rb +1 -1
- data/test/test_all_validations.rb +51 -66
- data/test/test_blast.rb +68 -51
- data/test/test_clusterization.rb +1 -1
- data/test/test_clusterization_2d.rb +19 -13
- data/test/test_extended_array_methods.rb +1 -1
- data/test/test_files/all_validations_mrna/mrna.blast_tab6 +1806 -0
- data/test/test_files/all_validations_mrna/mrna.blast_tab7 +1865 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml → mrna.blast_xml} +18642 -1
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml.index → mrna.blast_xml.index} +300 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta → mrna.fa} +0 -0
- data/test/test_files/all_validations_mrna/mrna.raw_seq +3970 -0
- data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml.raw_seq.idx → mrna.raw_seq.idx} +901 -1
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_tab → prot.blast_tab6} +416 -0
- data/test/test_files/all_validations_prot/prot.blast_tab7 +2400 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml → prot.blast_xml} +18299 -6723
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.index → prot.blast_xml.index} +408 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta → prot.fa} +0 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.raw_seq → prot.raw_seq} +2735 -0
- data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.raw_seq.idx → prot.raw_seq.idx} +3032 -1808
- data/test/test_sequences.rb +46 -41
- data/test/test_validation_open_reading_frame.rb +318 -202
- data/test/test_validations.rb +48 -32
- metadata +76 -102
- data/doc/AliasDuplicationError.html +0 -134
- data/doc/AlignmentValidation.html +0 -1687
- data/doc/AlignmentValidationOutput.html +0 -659
- data/doc/Blast.html +0 -1905
- data/doc/BlastRFValidationOutput.html +0 -545
- data/doc/BlastReadingFrameValidation.html +0 -370
- data/doc/BlastUtils.html +0 -875
- data/doc/ClasspathError.html +0 -134
- data/doc/Cluster.html +0 -1316
- data/doc/DuplciationValidationOutput.html +0 -564
- data/doc/DuplicationValidation.html +0 -920
- data/doc/DuplicationValidationOutput.html +0 -564
- data/doc/FileNotFoundException.html +0 -134
- data/doc/GeneMergeValidation.html +0 -935
- data/doc/GeneMergeValidationOutput.html +0 -652
- data/doc/HierarchicalClusterization.html +0 -994
- data/doc/Hsp.html +0 -1485
- data/doc/InconsistentTabularFormat.html +0 -135
- data/doc/LengthClusterValidation.html +0 -982
- data/doc/LengthClusterValidationOutput.html +0 -515
- data/doc/LengthRankValidation.html +0 -496
- data/doc/LengthRankValidationOutput.html +0 -517
- data/doc/NoInternetError.html +0 -135
- data/doc/NoMafftInstallationError.html +0 -134
- data/doc/NoPIdentError.html +0 -134
- data/doc/NoValidationError.html +0 -134
- data/doc/NotEnoughHitsError.html +0 -135
- data/doc/ORFValidationOutput.html +0 -593
- data/doc/OpenReadingFrameValidation.html +0 -1107
- data/doc/OtherError.html +0 -123
- data/doc/Output.html +0 -1540
- data/doc/Pair.html +0 -309
- data/doc/PairCluster.html +0 -767
- data/doc/Plot.html +0 -837
- data/doc/QueryError.html +0 -134
- data/doc/ReportClassError.html +0 -135
- data/doc/Sequence.html +0 -1299
- data/doc/SequenceTypeError.html +0 -135
- data/doc/TabularEntry.html +0 -837
- data/doc/TabularParser.html +0 -1104
- data/doc/Validation.html +0 -2147
- data/doc/ValidationClassError.html +0 -134
- data/doc/ValidationOutput.html +0 -460
- data/doc/ValidationReport.html +0 -940
- data/doc/ValidationTest.html +0 -939
- data/doc/_index.html +0 -449
- data/doc/class_list.html +0 -54
- data/doc/css/common.css +0 -1
- data/doc/css/full_list.css +0 -57
- data/doc/css/style.css +0 -338
- data/doc/file.README.html +0 -151
- data/doc/file_list.html +0 -56
- data/doc/frames.html +0 -26
- data/doc/index.html +0 -151
- data/doc/js/app.js +0 -214
- data/doc/js/full_list.js +0 -178
- data/doc/js/jquery.js +0 -4
- data/doc/method_list.html +0 -1505
- data/doc/top-level-namespace.html +0 -112
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab +0 -967
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.index +0 -967
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.raw_seq +0 -4929
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.raw_seq.idx +0 -1006
- data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_xml.raw_seq +0 -2075
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.index +0 -1864
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.raw_seq +0 -42411
- data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.raw_seq.idx +0 -3751
data/lib/genevalidator.rb
CHANGED
@@ -1,159 +1,94 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
3
|
+
require 'bio-blastxmlparser'
|
4
|
+
|
1
5
|
require 'genevalidator/arg_validation'
|
2
|
-
require 'genevalidator/get_raw_sequences'
|
3
|
-
require 'genevalidator/tabular_parser'
|
4
6
|
require 'genevalidator/blast'
|
5
|
-
require 'genevalidator/output'
|
6
7
|
require 'genevalidator/exceptions'
|
7
|
-
require 'genevalidator/
|
8
|
-
require 'genevalidator/
|
9
|
-
require 'genevalidator/
|
10
|
-
require 'genevalidator/
|
11
|
-
require 'genevalidator/validation_duplication'
|
12
|
-
require 'genevalidator/validation_open_reading_frame'
|
13
|
-
require 'genevalidator/validation_alignment'
|
14
|
-
require 'genevalidator/pool'
|
15
|
-
require 'bio-blastxmlparser'
|
16
|
-
require 'open-uri'
|
17
|
-
require 'uri'
|
18
|
-
require 'io/console'
|
19
|
-
require 'yaml'
|
20
|
-
require 'thread'
|
8
|
+
require 'genevalidator/get_raw_sequences'
|
9
|
+
require 'genevalidator/output'
|
10
|
+
require 'genevalidator/tabular_parser'
|
11
|
+
require 'genevalidator/validation'
|
21
12
|
|
22
13
|
# Top level module / namespace.
|
23
14
|
module GeneValidator
|
24
|
-
|
25
|
-
|
26
|
-
# Main Class that initalises and then runs validations.
|
27
|
-
class Validation
|
28
|
-
attr_reader :opt
|
29
|
-
attr_reader :type
|
30
|
-
attr_reader :input_fasta_file
|
31
|
-
attr_reader :html_path
|
32
|
-
attr_reader :yaml_path
|
33
|
-
attr_reader :filename
|
15
|
+
class << self
|
16
|
+
attr_accessor :opt, :config, :overview
|
34
17
|
attr_reader :raw_seq_file_index
|
35
18
|
attr_reader :raw_seq_file_load
|
36
|
-
attr_accessor :idx # current number of the querry processed
|
37
|
-
attr_reader :start_idx
|
38
19
|
# array of indexes for the start offsets of each query in the fasta file
|
39
|
-
attr_reader :
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
@
|
78
|
-
|
79
|
-
|
80
|
-
@
|
81
|
-
|
82
|
-
|
83
|
-
@
|
84
|
-
@mutex_array = Mutex.new
|
85
|
-
|
86
|
-
# global variables
|
87
|
-
@no_queries = 0
|
88
|
-
@scores = []
|
89
|
-
@good_predictions = 0
|
90
|
-
@bad_predictions = 0
|
91
|
-
@nee = 0
|
92
|
-
@no_mafft = 0
|
93
|
-
@no_internet = 0
|
94
|
-
@map_errors = Hash.new(0)
|
95
|
-
@map_running_times = Hash.new(Pair1.new(0, 0))
|
96
|
-
|
97
|
-
@type = determine_sequence_type
|
98
|
-
@query_offset_lst = index_the_input
|
99
|
-
|
100
|
-
# build the path of html folder output
|
101
|
-
dir = File.dirname(@opt[:input_fasta_file])
|
102
|
-
@filename = File.basename(@opt[:input_fasta_file])
|
103
|
-
@yaml_path = dir
|
104
|
-
@html_path = "#{opt[:input_fasta_file]}.html"
|
105
|
-
@plot_dir = "#{@html_path}/files/json"
|
106
|
-
|
107
|
-
# create 'html' directory
|
108
|
-
Dir.mkdir(@html_path)
|
109
|
-
# copy auxiliar folders to the html folder
|
110
|
-
aux = File.join(File.dirname(File.expand_path(__FILE__)), '../aux/files')
|
111
|
-
FileUtils.cp_r(aux, @html_path)
|
20
|
+
attr_reader :query_idx
|
21
|
+
attr_accessor :mutex, :mutex_html, :mutex_json, :mutex_array
|
22
|
+
|
23
|
+
def init(opt, start_idx = 1, summary = true)
|
24
|
+
$stderr.puts 'Analysing input arguments'
|
25
|
+
@opt = opt
|
26
|
+
GVArgValidation.validate_args # validates @opt
|
27
|
+
|
28
|
+
@config = {
|
29
|
+
idx: 0,
|
30
|
+
start_idx: start_idx,
|
31
|
+
summary: summary,
|
32
|
+
|
33
|
+
type: BlastUtils.guess_sequence_type_from_input_file,
|
34
|
+
filename: File.basename(@opt[:input_fasta_file]),
|
35
|
+
html_path: "#{@opt[:input_fasta_file]}.html",
|
36
|
+
json_file: File.join(File.dirname(@opt[:input_fasta_file]),
|
37
|
+
"#{File.basename(@opt[:input_fasta_file])}.json"),
|
38
|
+
plot_dir: "#{@opt[:input_fasta_file]}.html/files/json",
|
39
|
+
aux: File.expand_path(File.join(File.dirname(__FILE__), '../aux')),
|
40
|
+
|
41
|
+
json_output: [],
|
42
|
+
run_no: 0,
|
43
|
+
output_max: 2500 # max no. of queries in the output file
|
44
|
+
}
|
45
|
+
|
46
|
+
@overview = {
|
47
|
+
no_queries: 0,
|
48
|
+
scores: [],
|
49
|
+
good_scores: 0,
|
50
|
+
bad_scores: 0,
|
51
|
+
nee: 0,
|
52
|
+
no_mafft: 0,
|
53
|
+
no_internet: 0,
|
54
|
+
map_errors: Hash.new(0),
|
55
|
+
run_time: Hash.new(Pair1.new(0, 0))
|
56
|
+
}
|
57
|
+
|
58
|
+
@mutex = Mutex.new
|
59
|
+
@mutex_array = Mutex.new
|
60
|
+
@mutex_html = Mutex.new
|
61
|
+
@mutex_json = Mutex.new
|
62
|
+
create_output_folder
|
63
|
+
index_the_input
|
64
|
+
RawSequences.index_raw_seq_file if @opt[:raw_sequences]
|
112
65
|
end
|
113
66
|
|
114
67
|
##
|
115
68
|
# Parse the blast output and run validations
|
116
69
|
def run
|
117
|
-
# Run BLAST on all sequences
|
118
|
-
|
119
|
-
|
70
|
+
# Run BLAST on all sequences (generates @opt[:blast_xml_file])
|
71
|
+
# if no BLAST OUTPUT file provided...
|
120
72
|
unless @opt[:blast_xml_file] || @opt[:blast_tabular_file]
|
121
|
-
|
122
|
-
run_blast_on_each_sequence
|
123
|
-
else
|
124
|
-
# Extract raw sequences of hits
|
125
|
-
extract_raw_sequences_of_blast_hits unless @opt[:raw_sequences]
|
126
|
-
create_an_index_file_of_raw_seq_file(@opt[:raw_sequences])
|
127
|
-
# Run Validations
|
128
|
-
iterator = parse_blast_output_file
|
129
|
-
run_validations(iterator)
|
73
|
+
BlastUtils.run_blast_on_input_file
|
130
74
|
end
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
BlastUtils.guess_sequence_type_from_file(@opt[:input_fasta_file])
|
75
|
+
# Obtain fasta file of all BLAST hits
|
76
|
+
RawSequences.run unless @opt[:raw_sequences]
|
77
|
+
# Run Validations
|
78
|
+
iterator = parse_blast_output_file
|
79
|
+
(Validations.new).run_validations(iterator)
|
80
|
+
|
81
|
+
Output.write_json_file(@config[:json_output], @config[:json_file])
|
82
|
+
Output.print_footer(@overview, @config)
|
140
83
|
end
|
141
84
|
|
142
85
|
##
|
143
|
-
#
|
144
|
-
def
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
end
|
150
|
-
|
151
|
-
##
|
152
|
-
# Extracts raw sequences of all blast hits
|
153
|
-
def extract_raw_sequences_of_blast_hits
|
154
|
-
puts 'Extracting sequences within the BLAST output file from the BLAST' \
|
155
|
-
' database'
|
156
|
-
@opt[:raw_sequences] = GetRawSequences.run(@opt)
|
86
|
+
# Creates the output folder and copies the auxiliar folders to this folder
|
87
|
+
def create_output_folder(output_dir = @config[:html_path],
|
88
|
+
aux_dir = @config[:aux])
|
89
|
+
Dir.mkdir(output_dir)
|
90
|
+
aux_files = File.join(aux_dir, 'files/')
|
91
|
+
FileUtils.cp_r(aux_files, output_dir)
|
157
92
|
end
|
158
93
|
|
159
94
|
##
|
@@ -162,39 +97,8 @@ module GeneValidator
|
|
162
97
|
# start and end positions of each query.
|
163
98
|
def index_the_input
|
164
99
|
fasta_content = IO.binread(@opt[:input_fasta_file])
|
165
|
-
|
166
|
-
|
167
|
-
fasta_content = nil
|
168
|
-
offset_array
|
169
|
-
end
|
170
|
-
|
171
|
-
##
|
172
|
-
# Index the raw sequences file...
|
173
|
-
def create_an_index_file_of_raw_seq_file(raw_sequence_file)
|
174
|
-
# leave only the identifiers in the fasta description
|
175
|
-
content = File.open(raw_sequence_file, 'rb').read.gsub(/ .*/, '')
|
176
|
-
File.open(raw_sequence_file, 'w+') { |f| f.write(content) }
|
177
|
-
|
178
|
-
# index the fasta file
|
179
|
-
keys = content.scan(/>(.*)\n/).flatten
|
180
|
-
values = content.enum_for(:scan, /(>[^>]+)/).map { Regexp.last_match.begin(0) }
|
181
|
-
|
182
|
-
# make an index hash
|
183
|
-
index_hash = {}
|
184
|
-
keys.each_with_index do |k, i|
|
185
|
-
start = values[i]
|
186
|
-
endf = (i == values.length - 1) ? content.length - 1 : values[i + 1]
|
187
|
-
index_hash[k] = [start, endf]
|
188
|
-
end
|
189
|
-
|
190
|
-
# create FASTA index
|
191
|
-
@raw_seq_file_index = "#{raw_sequence_file}.idx"
|
192
|
-
@raw_seq_file_load = index_hash
|
193
|
-
|
194
|
-
File.open(@raw_seq_file_index, 'w') do |f|
|
195
|
-
YAML.dump(index_hash, f)
|
196
|
-
end
|
197
|
-
content = nil
|
100
|
+
@query_idx = fasta_content.enum_for(:scan, /(>[^>]+)/).map { Regexp.last_match.begin(0) }
|
101
|
+
@query_idx.push(fasta_content.length)
|
198
102
|
end
|
199
103
|
|
200
104
|
##
|
@@ -206,293 +110,9 @@ module GeneValidator
|
|
206
110
|
if @opt[:blast_xml_file]
|
207
111
|
Bio::BlastXMLParser::XmlIterator.new(@opt[:blast_xml_file]).to_enum
|
208
112
|
else
|
209
|
-
TabularParser.new
|
210
|
-
@opt[:blast_tabular_options], @type)
|
113
|
+
TabularParser.new
|
211
114
|
end
|
212
115
|
## TODO: Add a Rescue statement - e.g. if unable to create the Object...
|
213
116
|
end
|
214
|
-
|
215
|
-
##
|
216
|
-
#
|
217
|
-
def run_blast_on_each_sequence
|
218
|
-
# file seek for each query
|
219
|
-
@query_offset_lst[0..@query_offset_lst.length - 2].each_with_index do |_pos, i|
|
220
|
-
if (i + 1) >= @start_idx
|
221
|
-
start_offset = @query_offset_lst[i + 1] - @query_offset_lst[i]
|
222
|
-
end_offset = @query_offset_lst[i]
|
223
|
-
query = IO.binread(@opt[:input_fasta_file], start_offset, end_offset)
|
224
|
-
|
225
|
-
# call blast with the default parameters
|
226
|
-
blast_type = (type == :protein) ? 'blastp' : 'blastx'
|
227
|
-
blast_xml_output = BlastUtils.run_blast(blast_type, query, @opt[:db],
|
228
|
-
@opt[:num_threads])
|
229
|
-
iterator = Bio::BlastXMLParser::NokogiriBlastXml.new(blast_xml_output).to_enum
|
230
|
-
run_validations(iterator)
|
231
|
-
else
|
232
|
-
@idx += 1
|
233
|
-
end
|
234
|
-
end
|
235
|
-
end
|
236
|
-
|
237
|
-
##
|
238
|
-
#
|
239
|
-
def run_validations(iterator)
|
240
|
-
p = Pool.new(@opt[:num_threads]) if @opt[:num_threads] > 1
|
241
|
-
|
242
|
-
while @idx + 1 < @query_offset_lst.length
|
243
|
-
prediction = get_info_on_each_query_sequence
|
244
|
-
@idx += 1
|
245
|
-
|
246
|
-
hits = parse_next_iteration(iterator, prediction)
|
247
|
-
|
248
|
-
if hits.nil?
|
249
|
-
@idx -= 1
|
250
|
-
break
|
251
|
-
end
|
252
|
-
current_idx = @idx
|
253
|
-
# the first validation should be treated separately
|
254
|
-
if current_idx == @start_idx || @opt[:num_threads] == 1
|
255
|
-
validate(prediction, hits, current_idx)
|
256
|
-
else
|
257
|
-
p.schedule(prediction, hits, current_idx) do |prediction, hits, current_idx|
|
258
|
-
validate(prediction, hits, current_idx)
|
259
|
-
end
|
260
|
-
end
|
261
|
-
end
|
262
|
-
ensure
|
263
|
-
p.shutdown if @opt[:num_threads] > 1
|
264
|
-
end
|
265
|
-
|
266
|
-
def parse_next_iteration(iterator, prediction)
|
267
|
-
iterator.next if @idx < @start_idx
|
268
|
-
if @opt[:blast_xml_file]
|
269
|
-
BlastUtils.parse_next(iterator, @type)
|
270
|
-
elsif @opt[:blast_tabular_file]
|
271
|
-
iterator.parse_next(prediction.identifier)
|
272
|
-
end
|
273
|
-
end
|
274
|
-
|
275
|
-
##
|
276
|
-
# get info about the query
|
277
|
-
def get_info_on_each_query_sequence
|
278
|
-
prediction = Sequence.new
|
279
|
-
start_offset = @query_offset_lst[idx + 1] - @query_offset_lst[idx]
|
280
|
-
end_offset = @query_offset_lst[idx]
|
281
|
-
query = IO.binread(@opt[:input_fasta_file], start_offset, end_offset)
|
282
|
-
parse_query = query.scan(/>([^\n]*)\n([A-Za-z\n]*)/)[0]
|
283
|
-
|
284
|
-
prediction.definition = parse_query[0].gsub("\n", '')
|
285
|
-
prediction.identifier = prediction.definition.gsub(/ .*/, '')
|
286
|
-
prediction.type = @type
|
287
|
-
prediction.raw_sequence = parse_query[1].gsub("\n", '')
|
288
|
-
prediction.length_protein = prediction.raw_sequence.length
|
289
|
-
prediction.length_protein /= 3 if @type == :nucleotide
|
290
|
-
prediction
|
291
|
-
end
|
292
|
-
|
293
|
-
##
|
294
|
-
# Validate one query and create validation report
|
295
|
-
# Params:
|
296
|
-
# +prediction+: Sequence object
|
297
|
-
# +hits+: Array of +Sequence+ objects
|
298
|
-
# +idx+: the index number of the query
|
299
|
-
def validate(prediction, hits, current_idx)
|
300
|
-
query_output = do_validations(prediction, hits, current_idx)
|
301
|
-
query_output.generate_html
|
302
|
-
query_output.print_output_file_yaml
|
303
|
-
query_output.print_output_console
|
304
|
-
|
305
|
-
validations = query_output.validations
|
306
|
-
|
307
|
-
no_mafft = 0
|
308
|
-
no_internet = 0
|
309
|
-
errors = []
|
310
|
-
validations.each do |v|
|
311
|
-
unless v.errors.nil?
|
312
|
-
no_mafft += v.errors.select { |e| e == NoMafftInstallationError }.length
|
313
|
-
no_internet += v.errors.select { |e| e == NoInternetError }.length
|
314
|
-
end
|
315
|
-
errors.push(v.short_header) if v.validation == :error
|
316
|
-
end
|
317
|
-
|
318
|
-
no_evidence = validations.count { |v| v.result == :unapplicable || v.result == :warning } == validations.length
|
319
|
-
nee = (no_evidence) ? 1 : 0
|
320
|
-
|
321
|
-
good_predictions = (query_output.overall_score >= 75) ? 1 : 0
|
322
|
-
bad_predictions = (query_output.overall_score >= 75) ? 0 : 1
|
323
|
-
|
324
|
-
@mutex_array.synchronize do
|
325
|
-
@no_queries += 1
|
326
|
-
@scores.push(query_output.overall_score)
|
327
|
-
@good_predictions += good_predictions
|
328
|
-
@bad_predictions += bad_predictions
|
329
|
-
@nee += nee
|
330
|
-
@no_mafft += no_mafft
|
331
|
-
@no_internet += no_internet
|
332
|
-
errors.each { |err| @map_errors[err] += 1 }
|
333
|
-
|
334
|
-
validations.each do |v|
|
335
|
-
next if v.running_time == 0 || v.running_time.nil?
|
336
|
-
next if v.validation == :unapplicable || v.validation == :error
|
337
|
-
p = Pair1.new(@map_running_times[v.short_header].x + v.running_time, @map_running_times[v.short_header].y + 1)
|
338
|
-
@map_running_times[v.short_header] = p
|
339
|
-
end
|
340
|
-
end
|
341
|
-
query_output
|
342
|
-
end
|
343
|
-
|
344
|
-
##
|
345
|
-
# Removes identical hits
|
346
|
-
# Params:
|
347
|
-
# +prediction+: Sequence object
|
348
|
-
# +hits+: Array of +Sequence+ objects
|
349
|
-
# Output:
|
350
|
-
# new array of hit +Sequence+ objects
|
351
|
-
def remove_identical_hits(prediction, hits)
|
352
|
-
# remove the identical hits
|
353
|
-
# identical hit means 100%coverage and >99% identity
|
354
|
-
identical_hits = []
|
355
|
-
hits.each do |hit|
|
356
|
-
# check if all hsps have identity more than 99%
|
357
|
-
low_identity = hit.hsp_list.select { |hsp| hsp.pidentity.nil? || hsp.pidentity < 99 }
|
358
|
-
|
359
|
-
# check the coverage
|
360
|
-
coverage = Array.new(prediction.length_protein, 0)
|
361
|
-
hit.hsp_list.each do |hsp|
|
362
|
-
len = hsp.match_query_to - hsp.match_query_from + 1
|
363
|
-
coverage[hsp.match_query_from - 1..hsp.match_query_to - 1] = Array.new(len, 1)
|
364
|
-
end
|
365
|
-
|
366
|
-
if low_identity.length == 0 && coverage.uniq.length == 1
|
367
|
-
identical_hits.push(hit)
|
368
|
-
end
|
369
|
-
end
|
370
|
-
|
371
|
-
identical_hits.each { |hit| hits.delete(hit) }
|
372
|
-
hits
|
373
|
-
end
|
374
|
-
|
375
|
-
##
|
376
|
-
# Runs all the validations and prints the outputs given the current
|
377
|
-
# prediction query and the corresponding hits
|
378
|
-
# Params:
|
379
|
-
# +prediction+: Sequence object
|
380
|
-
# +hits+: Array of +Sequence+ objects
|
381
|
-
# +idx+: the index number of the query
|
382
|
-
# Output:
|
383
|
-
# +Output+ object
|
384
|
-
def do_validations(prediction, hits, current_idx)
|
385
|
-
begin
|
386
|
-
hits = remove_identical_hits(prediction, hits)
|
387
|
-
rescue Exception => error # NoPIdentError
|
388
|
-
end
|
389
|
-
|
390
|
-
query_output = Output.new(@mutex, @mutex_yaml, @mutex_html,
|
391
|
-
@filename, @html_path,
|
392
|
-
@yaml_path, current_idx, @start_idx)
|
393
|
-
query_output.prediction_len = prediction.length_protein
|
394
|
-
query_output.prediction_def = prediction.definition
|
395
|
-
query_output.nr_hits = hits.length
|
396
|
-
|
397
|
-
plot_path = File.join(@plot_dir, "#{@filename}_#{current_idx}")
|
398
|
-
|
399
|
-
validations = []
|
400
|
-
validations.push LengthClusterValidation.new(@type, prediction, hits,
|
401
|
-
plot_path)
|
402
|
-
validations.push LengthRankValidation.new(@type, prediction, hits)
|
403
|
-
validations.push GeneMergeValidation.new(@type, prediction, hits,
|
404
|
-
plot_path)
|
405
|
-
validations.push DuplicationValidation.new(@type, prediction, hits,
|
406
|
-
@opt[:raw_sequences],
|
407
|
-
@raw_seq_file_index,
|
408
|
-
@raw_seq_file_load, @opt[:db],
|
409
|
-
@opt[:num_threads])
|
410
|
-
validations.push BlastReadingFrameValidation.new(@type, prediction, hits)
|
411
|
-
validations.push OpenReadingFrameValidation.new(@type, prediction, hits,
|
412
|
-
plot_path)
|
413
|
-
validations.push AlignmentValidation.new(@type, prediction, hits,
|
414
|
-
plot_path, @opt[:raw_sequences],
|
415
|
-
@raw_seq_file_index,
|
416
|
-
@raw_seq_file_load,
|
417
|
-
@opt[:db], @opt[:num_threads])
|
418
|
-
|
419
|
-
validations = validations.select { |v| @opt[:validations].include? v.cli_name.downcase }
|
420
|
-
|
421
|
-
# check the class type of the elements in the list
|
422
|
-
validations.each do |v|
|
423
|
-
fail ValidationClassError unless v.is_a? ValidationTest
|
424
|
-
end
|
425
|
-
|
426
|
-
# check alias duplication
|
427
|
-
aliases = validations.map(&:cli_name)
|
428
|
-
fail AliasDuplicationError unless aliases.length == aliases.uniq.length
|
429
|
-
|
430
|
-
validations.each do |v|
|
431
|
-
v.run
|
432
|
-
fail ReportClassError unless v.validation_report.is_a? ValidationReport
|
433
|
-
end
|
434
|
-
query_output.validations = validations.map(&:validation_report)
|
435
|
-
|
436
|
-
fail NoValidationError if query_output.validations.length == 0
|
437
|
-
|
438
|
-
# compute validation score
|
439
|
-
compute_scores(query_output)
|
440
|
-
query_output
|
441
|
-
|
442
|
-
rescue ValidationClassError => error
|
443
|
-
error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
|
444
|
-
$stderr.print "Class Type error at #{error_line}." \
|
445
|
-
' Possible cause: type of one of the validations is not' \
|
446
|
-
" ValidationTest\n"
|
447
|
-
exit 1
|
448
|
-
rescue NoValidationError => error
|
449
|
-
error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
|
450
|
-
$stderr.print "Validation error at #{error_line}." \
|
451
|
-
" Possible cause: your -v arguments are not valid aliases\n"
|
452
|
-
exit 1
|
453
|
-
rescue ReportClassError => error
|
454
|
-
error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
|
455
|
-
$stderr.print "Class Type error at #{error_line}."\
|
456
|
-
' Possible cause: type of one of the validation reports' \
|
457
|
-
" returned by the 'run' method is not ValidationReport\n"
|
458
|
-
exit 1
|
459
|
-
rescue AliasDuplicationError => error
|
460
|
-
error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
|
461
|
-
$stderr.print "Alias Duplication error at #{error_line}."\
|
462
|
-
' Possible cause: At least two validations have the same' \
|
463
|
-
" CLI alias\n"
|
464
|
-
exit 1
|
465
|
-
end
|
466
|
-
|
467
|
-
def compute_scores(query_output)
|
468
|
-
validations = query_output.validations
|
469
|
-
successes = validations.map { |v| v.result == v.expected }.count(true)
|
470
|
-
|
471
|
-
fails = validations.map { |v| v.validation != :unapplicable &&
|
472
|
-
v.validation != :error &&
|
473
|
-
v.result != v.expected }.count(true)
|
474
|
-
|
475
|
-
lcv = validations.select { |v| v.class == LengthClusterValidationOutput }
|
476
|
-
lrv = validations.select { |v| v.class == LengthRankValidationOutput }
|
477
|
-
if lcv.length == 1 && lrv.length == 1
|
478
|
-
score_lcv = (lcv[0].result == lcv[0].expected)
|
479
|
-
score_lrv = (lrv[0].result == lrv[0].expected)
|
480
|
-
# if both are true this should be counted as a single success
|
481
|
-
if score_lcv == true && score_lrv == true
|
482
|
-
successes -= 1
|
483
|
-
elsif score_lcv == false && score_lrv == false
|
484
|
-
# if both are false this will be a fail
|
485
|
-
fails -= 1
|
486
|
-
else
|
487
|
-
successes -= 0.5
|
488
|
-
fails -= 0.5
|
489
|
-
end
|
490
|
-
end
|
491
|
-
|
492
|
-
query_output.successes = successes
|
493
|
-
query_output.fails = fails
|
494
|
-
total_query = successes.to_i + fails
|
495
|
-
query_output.overall_score = (successes * 100 / (total_query)).round(0)
|
496
|
-
end
|
497
117
|
end
|
498
118
|
end
|