genevalidator 1.6.1 → 1.6.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (131)
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -1
  3. data/.travis.yml +2 -0
  4. data/README.md +78 -30
  5. data/Rakefile +11 -8
  6. data/aux/app_template_footer.erb +1 -6
  7. data/aux/app_template_header.erb +12 -32
  8. data/aux/files/css/style.css +2 -8
  9. data/aux/files/js/plots.js +564 -576
  10. data/aux/files/js/script.js +10 -0
  11. data/aux/json_footer.erb +8 -0
  12. data/aux/json_header.erb +19 -0
  13. data/aux/json_query.erb +14 -0
  14. data/aux/template_footer.erb +9 -58
  15. data/aux/template_header.erb +18 -58
  16. data/aux/template_query.erb +8 -36
  17. data/bin/genevalidator +45 -32
  18. data/genevalidator.gemspec +11 -7
  19. data/lib/genevalidator.rb +75 -455
  20. data/lib/genevalidator/arg_validation.rb +78 -107
  21. data/lib/genevalidator/blast.rb +57 -60
  22. data/lib/genevalidator/clusterization.rb +15 -15
  23. data/lib/genevalidator/exceptions.rb +32 -5
  24. data/lib/genevalidator/get_raw_sequences.rb +70 -33
  25. data/lib/genevalidator/hsp.rb +1 -4
  26. data/lib/genevalidator/json_to_gv_results.rb +109 -0
  27. data/lib/genevalidator/output.rb +177 -185
  28. data/lib/genevalidator/pool.rb +2 -1
  29. data/lib/genevalidator/sequences.rb +3 -3
  30. data/lib/genevalidator/tabular_parser.rb +24 -18
  31. data/lib/genevalidator/validation.rb +279 -0
  32. data/lib/genevalidator/validation_alignment.rb +31 -47
  33. data/lib/genevalidator/validation_blast_reading_frame.rb +19 -18
  34. data/lib/genevalidator/validation_duplication.rb +23 -19
  35. data/lib/genevalidator/validation_gene_merge.rb +30 -65
  36. data/lib/genevalidator/validation_length_cluster.rb +14 -53
  37. data/lib/genevalidator/validation_length_rank.rb +10 -11
  38. data/lib/genevalidator/validation_open_reading_frame.rb +18 -19
  39. data/lib/genevalidator/validation_report.rb +2 -5
  40. data/lib/genevalidator/validation_test.rb +8 -4
  41. data/lib/genevalidator/version.rb +1 -1
  42. data/test/test_all_validations.rb +51 -66
  43. data/test/test_blast.rb +68 -51
  44. data/test/test_clusterization.rb +1 -1
  45. data/test/test_clusterization_2d.rb +19 -13
  46. data/test/test_extended_array_methods.rb +1 -1
  47. data/test/test_files/all_validations_mrna/mrna.blast_tab6 +1806 -0
  48. data/test/test_files/all_validations_mrna/mrna.blast_tab7 +1865 -0
  49. data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml → mrna.blast_xml} +18642 -1
  50. data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml.index → mrna.blast_xml.index} +300 -0
  51. data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta → mrna.fa} +0 -0
  52. data/test/test_files/all_validations_mrna/mrna.raw_seq +3970 -0
  53. data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml.raw_seq.idx → mrna.raw_seq.idx} +901 -1
  54. data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_tab → prot.blast_tab6} +416 -0
  55. data/test/test_files/all_validations_prot/prot.blast_tab7 +2400 -0
  56. data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml → prot.blast_xml} +18299 -6723
  57. data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.index → prot.blast_xml.index} +408 -0
  58. data/test/test_files/all_validations_prot/{all_validations_prot.fasta → prot.fa} +0 -0
  59. data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.raw_seq → prot.raw_seq} +2735 -0
  60. data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.raw_seq.idx → prot.raw_seq.idx} +3032 -1808
  61. data/test/test_sequences.rb +46 -41
  62. data/test/test_validation_open_reading_frame.rb +318 -202
  63. data/test/test_validations.rb +48 -32
  64. metadata +76 -102
  65. data/doc/AliasDuplicationError.html +0 -134
  66. data/doc/AlignmentValidation.html +0 -1687
  67. data/doc/AlignmentValidationOutput.html +0 -659
  68. data/doc/Blast.html +0 -1905
  69. data/doc/BlastRFValidationOutput.html +0 -545
  70. data/doc/BlastReadingFrameValidation.html +0 -370
  71. data/doc/BlastUtils.html +0 -875
  72. data/doc/ClasspathError.html +0 -134
  73. data/doc/Cluster.html +0 -1316
  74. data/doc/DuplciationValidationOutput.html +0 -564
  75. data/doc/DuplicationValidation.html +0 -920
  76. data/doc/DuplicationValidationOutput.html +0 -564
  77. data/doc/FileNotFoundException.html +0 -134
  78. data/doc/GeneMergeValidation.html +0 -935
  79. data/doc/GeneMergeValidationOutput.html +0 -652
  80. data/doc/HierarchicalClusterization.html +0 -994
  81. data/doc/Hsp.html +0 -1485
  82. data/doc/InconsistentTabularFormat.html +0 -135
  83. data/doc/LengthClusterValidation.html +0 -982
  84. data/doc/LengthClusterValidationOutput.html +0 -515
  85. data/doc/LengthRankValidation.html +0 -496
  86. data/doc/LengthRankValidationOutput.html +0 -517
  87. data/doc/NoInternetError.html +0 -135
  88. data/doc/NoMafftInstallationError.html +0 -134
  89. data/doc/NoPIdentError.html +0 -134
  90. data/doc/NoValidationError.html +0 -134
  91. data/doc/NotEnoughHitsError.html +0 -135
  92. data/doc/ORFValidationOutput.html +0 -593
  93. data/doc/OpenReadingFrameValidation.html +0 -1107
  94. data/doc/OtherError.html +0 -123
  95. data/doc/Output.html +0 -1540
  96. data/doc/Pair.html +0 -309
  97. data/doc/PairCluster.html +0 -767
  98. data/doc/Plot.html +0 -837
  99. data/doc/QueryError.html +0 -134
  100. data/doc/ReportClassError.html +0 -135
  101. data/doc/Sequence.html +0 -1299
  102. data/doc/SequenceTypeError.html +0 -135
  103. data/doc/TabularEntry.html +0 -837
  104. data/doc/TabularParser.html +0 -1104
  105. data/doc/Validation.html +0 -2147
  106. data/doc/ValidationClassError.html +0 -134
  107. data/doc/ValidationOutput.html +0 -460
  108. data/doc/ValidationReport.html +0 -940
  109. data/doc/ValidationTest.html +0 -939
  110. data/doc/_index.html +0 -449
  111. data/doc/class_list.html +0 -54
  112. data/doc/css/common.css +0 -1
  113. data/doc/css/full_list.css +0 -57
  114. data/doc/css/style.css +0 -338
  115. data/doc/file.README.html +0 -151
  116. data/doc/file_list.html +0 -56
  117. data/doc/frames.html +0 -26
  118. data/doc/index.html +0 -151
  119. data/doc/js/app.js +0 -214
  120. data/doc/js/full_list.js +0 -178
  121. data/doc/js/jquery.js +0 -4
  122. data/doc/method_list.html +0 -1505
  123. data/doc/top-level-namespace.html +0 -112
  124. data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab +0 -967
  125. data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.index +0 -967
  126. data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.raw_seq +0 -4929
  127. data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.raw_seq.idx +0 -1006
  128. data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_xml.raw_seq +0 -2075
  129. data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.index +0 -1864
  130. data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.raw_seq +0 -42411
  131. data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.raw_seq.idx +0 -3751
@@ -1,159 +1,94 @@
1
+ require 'fileutils'
2
+
3
+ require 'bio-blastxmlparser'
4
+
1
5
  require 'genevalidator/arg_validation'
2
- require 'genevalidator/get_raw_sequences'
3
- require 'genevalidator/tabular_parser'
4
6
  require 'genevalidator/blast'
5
- require 'genevalidator/output'
6
7
  require 'genevalidator/exceptions'
7
- require 'genevalidator/validation_length_cluster'
8
- require 'genevalidator/validation_length_rank'
9
- require 'genevalidator/validation_blast_reading_frame'
10
- require 'genevalidator/validation_gene_merge'
11
- require 'genevalidator/validation_duplication'
12
- require 'genevalidator/validation_open_reading_frame'
13
- require 'genevalidator/validation_alignment'
14
- require 'genevalidator/pool'
15
- require 'bio-blastxmlparser'
16
- require 'open-uri'
17
- require 'uri'
18
- require 'io/console'
19
- require 'yaml'
20
- require 'thread'
8
+ require 'genevalidator/get_raw_sequences'
9
+ require 'genevalidator/output'
10
+ require 'genevalidator/tabular_parser'
11
+ require 'genevalidator/validation'
21
12
 
22
13
  # Top level module / namespace.
23
14
  module GeneValidator
24
- Pair1 = Struct.new(:x, :y)
25
-
26
- # Main Class that initalises and then runs validations.
27
- class Validation
28
- attr_reader :opt
29
- attr_reader :type
30
- attr_reader :input_fasta_file
31
- attr_reader :html_path
32
- attr_reader :yaml_path
33
- attr_reader :filename
15
+ class << self
16
+ attr_accessor :opt, :config, :overview
34
17
  attr_reader :raw_seq_file_index
35
18
  attr_reader :raw_seq_file_load
36
- attr_accessor :idx # current number of the querry processed
37
- attr_reader :start_idx
38
19
  # array of indexes for the start offsets of each query in the fasta file
39
- attr_reader :query_offset_lst
40
- attr_reader :overall_evaluation
41
-
42
- # global variables
43
- attr_reader :no_queries
44
- attr_reader :scores
45
- attr_reader :good_predictions
46
- attr_reader :bad_predictions
47
- attr_reader :nee
48
- attr_reader :no_mafft
49
- attr_reader :no_internet
50
- attr_reader :map_errors
51
- attr_reader :map_running_times
52
-
53
- attr_reader :threads
54
- attr_reader :mutex
55
- attr_reader :mutex_yaml
56
- attr_reader :mutex_html
57
- attr_reader :mutex_array
58
-
59
- ##
60
- # Initilizes the object
61
- # Params:
62
- # +opt+: A hash - Default Values: {validations: ['all'],
63
- # blast_tabular_file: nil, blast_tabular_options: nil, blast_xml_file: nil,
64
- # db: 'remote', raw_sequences: nil, num_threads: 1 fast: false}
65
- # +start_idx+: number of the sequence from the file to start with
66
- # +overall_evaluation+: boolean variable for printing overall evaluation
67
- def initialize(opt, start_idx = 1, overall_evaluation = true)
68
- # Validate opts
69
- @opt = GVArgValidation.validate_args(opt)
70
-
71
- puts "\nDepending on your input and your computational resources, this"\
72
- ' may take a while. Please wait...'
73
-
74
- @idx = 0
75
- @start_idx = start_idx
76
-
77
- @overall_evaluation = overall_evaluation
78
-
79
- # start a worker thread
80
- @threads = [] # used for parallelizing the validations.
81
- @mutex = Mutex.new
82
- @mutex_yaml = Mutex.new
83
- @mutex_html = Mutex.new
84
- @mutex_array = Mutex.new
85
-
86
- # global variables
87
- @no_queries = 0
88
- @scores = []
89
- @good_predictions = 0
90
- @bad_predictions = 0
91
- @nee = 0
92
- @no_mafft = 0
93
- @no_internet = 0
94
- @map_errors = Hash.new(0)
95
- @map_running_times = Hash.new(Pair1.new(0, 0))
96
-
97
- @type = determine_sequence_type
98
- @query_offset_lst = index_the_input
99
-
100
- # build the path of html folder output
101
- dir = File.dirname(@opt[:input_fasta_file])
102
- @filename = File.basename(@opt[:input_fasta_file])
103
- @yaml_path = dir
104
- @html_path = "#{opt[:input_fasta_file]}.html"
105
- @plot_dir = "#{@html_path}/files/json"
106
-
107
- # create 'html' directory
108
- Dir.mkdir(@html_path)
109
- # copy auxiliar folders to the html folder
110
- aux = File.join(File.dirname(File.expand_path(__FILE__)), '../aux/files')
111
- FileUtils.cp_r(aux, @html_path)
20
+ attr_reader :query_idx
21
+ attr_accessor :mutex, :mutex_html, :mutex_json, :mutex_array
22
+
23
+ def init(opt, start_idx = 1, summary = true)
24
+ $stderr.puts 'Analysing input arguments'
25
+ @opt = opt
26
+ GVArgValidation.validate_args # validates @opt
27
+
28
+ @config = {
29
+ idx: 0,
30
+ start_idx: start_idx,
31
+ summary: summary,
32
+
33
+ type: BlastUtils.guess_sequence_type_from_input_file,
34
+ filename: File.basename(@opt[:input_fasta_file]),
35
+ html_path: "#{@opt[:input_fasta_file]}.html",
36
+ json_file: File.join(File.dirname(@opt[:input_fasta_file]),
37
+ "#{File.basename(@opt[:input_fasta_file])}.json"),
38
+ plot_dir: "#{@opt[:input_fasta_file]}.html/files/json",
39
+ aux: File.expand_path(File.join(File.dirname(__FILE__), '../aux')),
40
+
41
+ json_output: [],
42
+ run_no: 0,
43
+ output_max: 2500 # max no. of queries in the output file
44
+ }
45
+
46
+ @overview = {
47
+ no_queries: 0,
48
+ scores: [],
49
+ good_scores: 0,
50
+ bad_scores: 0,
51
+ nee: 0,
52
+ no_mafft: 0,
53
+ no_internet: 0,
54
+ map_errors: Hash.new(0),
55
+ run_time: Hash.new(Pair1.new(0, 0))
56
+ }
57
+
58
+ @mutex = Mutex.new
59
+ @mutex_array = Mutex.new
60
+ @mutex_html = Mutex.new
61
+ @mutex_json = Mutex.new
62
+ create_output_folder
63
+ index_the_input
64
+ RawSequences.index_raw_seq_file if @opt[:raw_sequences]
112
65
  end
113
66
 
114
67
  ##
115
68
  # Parse the blast output and run validations
116
69
  def run
117
- # Run BLAST on all sequences
118
- run_blast_on_the_input_file if @opt[:fast]
119
-
70
+ # Run BLAST on all sequences (generates @opt[:blast_xml_file])
71
+ # if no BLAST OUTPUT file provided...
120
72
  unless @opt[:blast_xml_file] || @opt[:blast_tabular_file]
121
- # run BLAST on each sequence individually & then run validations
122
- run_blast_on_each_sequence
123
- else
124
- # Extract raw sequences of hits
125
- extract_raw_sequences_of_blast_hits unless @opt[:raw_sequences]
126
- create_an_index_file_of_raw_seq_file(@opt[:raw_sequences])
127
- # Run Validations
128
- iterator = parse_blast_output_file
129
- run_validations(iterator)
73
+ BlastUtils.run_blast_on_input_file
130
74
  end
131
- return unless @overall_evaluation
132
- Output.print_footer(@no_queries, @scores, @good_predictions,
133
- @bad_predictions, @nee, @no_mafft, @no_internet,
134
- @map_errors, @map_running_times, @html_path,
135
- @filename)
136
- end
137
-
138
- def determine_sequence_type
139
- BlastUtils.guess_sequence_type_from_file(@opt[:input_fasta_file])
75
+ # Obtain fasta file of all BLAST hits
76
+ RawSequences.run unless @opt[:raw_sequences]
77
+ # Run Validations
78
+ iterator = parse_blast_output_file
79
+ (Validations.new).run_validations(iterator)
80
+
81
+ Output.write_json_file(@config[:json_output], @config[:json_file])
82
+ Output.print_footer(@overview, @config)
140
83
  end
141
84
 
142
85
  ##
143
- # Runs BLAST on the input file - only run when the opt[:fast] is true
144
- def run_blast_on_the_input_file
145
- return if @opt[:blast_xml_file] || @opt[:blast_tabular_file]
146
- puts 'Running BLAST'
147
- @opt[:blast_xml_file] = @opt[:input_fasta_file] + '.blast_xml'
148
- BlastUtils.run_blast_on_file(@opt)
149
- end
150
-
151
- ##
152
- # Extracts raw sequences of all blast hits
153
- def extract_raw_sequences_of_blast_hits
154
- puts 'Extracting sequences within the BLAST output file from the BLAST' \
155
- ' database'
156
- @opt[:raw_sequences] = GetRawSequences.run(@opt)
86
+ # Creates the output folder and copies the auxiliary folders to this folder
87
+ def create_output_folder(output_dir = @config[:html_path],
88
+ aux_dir = @config[:aux])
89
+ Dir.mkdir(output_dir)
90
+ aux_files = File.join(aux_dir, 'files/')
91
+ FileUtils.cp_r(aux_files, output_dir)
157
92
  end
158
93
 
159
94
  ##
@@ -162,39 +97,8 @@ module GeneValidator
162
97
  # start and end positions of each query.
163
98
  def index_the_input
164
99
  fasta_content = IO.binread(@opt[:input_fasta_file])
165
- offset_array = fasta_content.enum_for(:scan, /(>[^>]+)/).map { Regexp.last_match.begin(0) }
166
- offset_array.push(fasta_content.length)
167
- fasta_content = nil
168
- offset_array
169
- end
170
-
171
- ##
172
- # Index the raw sequences file...
173
- def create_an_index_file_of_raw_seq_file(raw_sequence_file)
174
- # leave only the identifiers in the fasta description
175
- content = File.open(raw_sequence_file, 'rb').read.gsub(/ .*/, '')
176
- File.open(raw_sequence_file, 'w+') { |f| f.write(content) }
177
-
178
- # index the fasta file
179
- keys = content.scan(/>(.*)\n/).flatten
180
- values = content.enum_for(:scan, /(>[^>]+)/).map { Regexp.last_match.begin(0) }
181
-
182
- # make an index hash
183
- index_hash = {}
184
- keys.each_with_index do |k, i|
185
- start = values[i]
186
- endf = (i == values.length - 1) ? content.length - 1 : values[i + 1]
187
- index_hash[k] = [start, endf]
188
- end
189
-
190
- # create FASTA index
191
- @raw_seq_file_index = "#{raw_sequence_file}.idx"
192
- @raw_seq_file_load = index_hash
193
-
194
- File.open(@raw_seq_file_index, 'w') do |f|
195
- YAML.dump(index_hash, f)
196
- end
197
- content = nil
100
+ @query_idx = fasta_content.enum_for(:scan, /(>[^>]+)/).map { Regexp.last_match.begin(0) }
101
+ @query_idx.push(fasta_content.length)
198
102
  end
199
103
 
200
104
  ##
@@ -206,293 +110,9 @@ module GeneValidator
206
110
  if @opt[:blast_xml_file]
207
111
  Bio::BlastXMLParser::XmlIterator.new(@opt[:blast_xml_file]).to_enum
208
112
  else
209
- TabularParser.new(@opt[:blast_tabular_file],
210
- @opt[:blast_tabular_options], @type)
113
+ TabularParser.new
211
114
  end
212
115
  ## TODO: Add a Rescue statement - e.g. if unable to create the Object...
213
116
  end
214
-
215
- ##
216
- #
217
- def run_blast_on_each_sequence
218
- # file seek for each query
219
- @query_offset_lst[0..@query_offset_lst.length - 2].each_with_index do |_pos, i|
220
- if (i + 1) >= @start_idx
221
- start_offset = @query_offset_lst[i + 1] - @query_offset_lst[i]
222
- end_offset = @query_offset_lst[i]
223
- query = IO.binread(@opt[:input_fasta_file], start_offset, end_offset)
224
-
225
- # call blast with the default parameters
226
- blast_type = (type == :protein) ? 'blastp' : 'blastx'
227
- blast_xml_output = BlastUtils.run_blast(blast_type, query, @opt[:db],
228
- @opt[:num_threads])
229
- iterator = Bio::BlastXMLParser::NokogiriBlastXml.new(blast_xml_output).to_enum
230
- run_validations(iterator)
231
- else
232
- @idx += 1
233
- end
234
- end
235
- end
236
-
237
- ##
238
- #
239
- def run_validations(iterator)
240
- p = Pool.new(@opt[:num_threads]) if @opt[:num_threads] > 1
241
-
242
- while @idx + 1 < @query_offset_lst.length
243
- prediction = get_info_on_each_query_sequence
244
- @idx += 1
245
-
246
- hits = parse_next_iteration(iterator, prediction)
247
-
248
- if hits.nil?
249
- @idx -= 1
250
- break
251
- end
252
- current_idx = @idx
253
- # the first validation should be treated separately
254
- if current_idx == @start_idx || @opt[:num_threads] == 1
255
- validate(prediction, hits, current_idx)
256
- else
257
- p.schedule(prediction, hits, current_idx) do |prediction, hits, current_idx|
258
- validate(prediction, hits, current_idx)
259
- end
260
- end
261
- end
262
- ensure
263
- p.shutdown if @opt[:num_threads] > 1
264
- end
265
-
266
- def parse_next_iteration(iterator, prediction)
267
- iterator.next if @idx < @start_idx
268
- if @opt[:blast_xml_file]
269
- BlastUtils.parse_next(iterator, @type)
270
- elsif @opt[:blast_tabular_file]
271
- iterator.parse_next(prediction.identifier)
272
- end
273
- end
274
-
275
- ##
276
- # get info about the query
277
- def get_info_on_each_query_sequence
278
- prediction = Sequence.new
279
- start_offset = @query_offset_lst[idx + 1] - @query_offset_lst[idx]
280
- end_offset = @query_offset_lst[idx]
281
- query = IO.binread(@opt[:input_fasta_file], start_offset, end_offset)
282
- parse_query = query.scan(/>([^\n]*)\n([A-Za-z\n]*)/)[0]
283
-
284
- prediction.definition = parse_query[0].gsub("\n", '')
285
- prediction.identifier = prediction.definition.gsub(/ .*/, '')
286
- prediction.type = @type
287
- prediction.raw_sequence = parse_query[1].gsub("\n", '')
288
- prediction.length_protein = prediction.raw_sequence.length
289
- prediction.length_protein /= 3 if @type == :nucleotide
290
- prediction
291
- end
292
-
293
- ##
294
- # Validate one query and create validation report
295
- # Params:
296
- # +prediction+: Sequence object
297
- # +hits+: Array of +Sequence+ objects
298
- # +idx+: the index number of the query
299
- def validate(prediction, hits, current_idx)
300
- query_output = do_validations(prediction, hits, current_idx)
301
- query_output.generate_html
302
- query_output.print_output_file_yaml
303
- query_output.print_output_console
304
-
305
- validations = query_output.validations
306
-
307
- no_mafft = 0
308
- no_internet = 0
309
- errors = []
310
- validations.each do |v|
311
- unless v.errors.nil?
312
- no_mafft += v.errors.select { |e| e == NoMafftInstallationError }.length
313
- no_internet += v.errors.select { |e| e == NoInternetError }.length
314
- end
315
- errors.push(v.short_header) if v.validation == :error
316
- end
317
-
318
- no_evidence = validations.count { |v| v.result == :unapplicable || v.result == :warning } == validations.length
319
- nee = (no_evidence) ? 1 : 0
320
-
321
- good_predictions = (query_output.overall_score >= 75) ? 1 : 0
322
- bad_predictions = (query_output.overall_score >= 75) ? 0 : 1
323
-
324
- @mutex_array.synchronize do
325
- @no_queries += 1
326
- @scores.push(query_output.overall_score)
327
- @good_predictions += good_predictions
328
- @bad_predictions += bad_predictions
329
- @nee += nee
330
- @no_mafft += no_mafft
331
- @no_internet += no_internet
332
- errors.each { |err| @map_errors[err] += 1 }
333
-
334
- validations.each do |v|
335
- next if v.running_time == 0 || v.running_time.nil?
336
- next if v.validation == :unapplicable || v.validation == :error
337
- p = Pair1.new(@map_running_times[v.short_header].x + v.running_time, @map_running_times[v.short_header].y + 1)
338
- @map_running_times[v.short_header] = p
339
- end
340
- end
341
- query_output
342
- end
343
-
344
- ##
345
- # Removes identical hits
346
- # Params:
347
- # +prediction+: Sequence object
348
- # +hits+: Array of +Sequence+ objects
349
- # Output:
350
- # new array of hit +Sequence+ objects
351
- def remove_identical_hits(prediction, hits)
352
- # remove the identical hits
353
- # identical hit means 100%coverage and >99% identity
354
- identical_hits = []
355
- hits.each do |hit|
356
- # check if all hsps have identity more than 99%
357
- low_identity = hit.hsp_list.select { |hsp| hsp.pidentity.nil? || hsp.pidentity < 99 }
358
-
359
- # check the coverage
360
- coverage = Array.new(prediction.length_protein, 0)
361
- hit.hsp_list.each do |hsp|
362
- len = hsp.match_query_to - hsp.match_query_from + 1
363
- coverage[hsp.match_query_from - 1..hsp.match_query_to - 1] = Array.new(len, 1)
364
- end
365
-
366
- if low_identity.length == 0 && coverage.uniq.length == 1
367
- identical_hits.push(hit)
368
- end
369
- end
370
-
371
- identical_hits.each { |hit| hits.delete(hit) }
372
- hits
373
- end
374
-
375
- ##
376
- # Runs all the validations and prints the outputs given the current
377
- # prediction query and the corresponding hits
378
- # Params:
379
- # +prediction+: Sequence object
380
- # +hits+: Array of +Sequence+ objects
381
- # +idx+: the index number of the query
382
- # Output:
383
- # +Output+ object
384
- def do_validations(prediction, hits, current_idx)
385
- begin
386
- hits = remove_identical_hits(prediction, hits)
387
- rescue Exception => error # NoPIdentError
388
- end
389
-
390
- query_output = Output.new(@mutex, @mutex_yaml, @mutex_html,
391
- @filename, @html_path,
392
- @yaml_path, current_idx, @start_idx)
393
- query_output.prediction_len = prediction.length_protein
394
- query_output.prediction_def = prediction.definition
395
- query_output.nr_hits = hits.length
396
-
397
- plot_path = File.join(@plot_dir, "#{@filename}_#{current_idx}")
398
-
399
- validations = []
400
- validations.push LengthClusterValidation.new(@type, prediction, hits,
401
- plot_path)
402
- validations.push LengthRankValidation.new(@type, prediction, hits)
403
- validations.push GeneMergeValidation.new(@type, prediction, hits,
404
- plot_path)
405
- validations.push DuplicationValidation.new(@type, prediction, hits,
406
- @opt[:raw_sequences],
407
- @raw_seq_file_index,
408
- @raw_seq_file_load, @opt[:db],
409
- @opt[:num_threads])
410
- validations.push BlastReadingFrameValidation.new(@type, prediction, hits)
411
- validations.push OpenReadingFrameValidation.new(@type, prediction, hits,
412
- plot_path)
413
- validations.push AlignmentValidation.new(@type, prediction, hits,
414
- plot_path, @opt[:raw_sequences],
415
- @raw_seq_file_index,
416
- @raw_seq_file_load,
417
- @opt[:db], @opt[:num_threads])
418
-
419
- validations = validations.select { |v| @opt[:validations].include? v.cli_name.downcase }
420
-
421
- # check the class type of the elements in the list
422
- validations.each do |v|
423
- fail ValidationClassError unless v.is_a? ValidationTest
424
- end
425
-
426
- # check alias duplication
427
- aliases = validations.map(&:cli_name)
428
- fail AliasDuplicationError unless aliases.length == aliases.uniq.length
429
-
430
- validations.each do |v|
431
- v.run
432
- fail ReportClassError unless v.validation_report.is_a? ValidationReport
433
- end
434
- query_output.validations = validations.map(&:validation_report)
435
-
436
- fail NoValidationError if query_output.validations.length == 0
437
-
438
- # compute validation score
439
- compute_scores(query_output)
440
- query_output
441
-
442
- rescue ValidationClassError => error
443
- error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
444
- $stderr.print "Class Type error at #{error_line}." \
445
- ' Possible cause: type of one of the validations is not' \
446
- " ValidationTest\n"
447
- exit 1
448
- rescue NoValidationError => error
449
- error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
450
- $stderr.print "Validation error at #{error_line}." \
451
- " Possible cause: your -v arguments are not valid aliases\n"
452
- exit 1
453
- rescue ReportClassError => error
454
- error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
455
- $stderr.print "Class Type error at #{error_line}."\
456
- ' Possible cause: type of one of the validation reports' \
457
- " returned by the 'run' method is not ValidationReport\n"
458
- exit 1
459
- rescue AliasDuplicationError => error
460
- error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
461
- $stderr.print "Alias Duplication error at #{error_line}."\
462
- ' Possible cause: At least two validations have the same' \
463
- " CLI alias\n"
464
- exit 1
465
- end
466
-
467
- def compute_scores(query_output)
468
- validations = query_output.validations
469
- successes = validations.map { |v| v.result == v.expected }.count(true)
470
-
471
- fails = validations.map { |v| v.validation != :unapplicable &&
472
- v.validation != :error &&
473
- v.result != v.expected }.count(true)
474
-
475
- lcv = validations.select { |v| v.class == LengthClusterValidationOutput }
476
- lrv = validations.select { |v| v.class == LengthRankValidationOutput }
477
- if lcv.length == 1 && lrv.length == 1
478
- score_lcv = (lcv[0].result == lcv[0].expected)
479
- score_lrv = (lrv[0].result == lrv[0].expected)
480
- # if both are true this should be counted as a single success
481
- if score_lcv == true && score_lrv == true
482
- successes -= 1
483
- elsif score_lcv == false && score_lrv == false
484
- # if both are false this will be a fail
485
- fails -= 1
486
- else
487
- successes -= 0.5
488
- fails -= 0.5
489
- end
490
- end
491
-
492
- query_output.successes = successes
493
- query_output.fails = fails
494
- total_query = successes.to_i + fails
495
- query_output.overall_score = (successes * 100 / (total_query)).round(0)
496
- end
497
117
  end
498
118
  end