genevalidator 1.6.1 → 1.6.2

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -1
  3. data/.travis.yml +2 -0
  4. data/README.md +78 -30
  5. data/Rakefile +11 -8
  6. data/aux/app_template_footer.erb +1 -6
  7. data/aux/app_template_header.erb +12 -32
  8. data/aux/files/css/style.css +2 -8
  9. data/aux/files/js/plots.js +564 -576
  10. data/aux/files/js/script.js +10 -0
  11. data/aux/json_footer.erb +8 -0
  12. data/aux/json_header.erb +19 -0
  13. data/aux/json_query.erb +14 -0
  14. data/aux/template_footer.erb +9 -58
  15. data/aux/template_header.erb +18 -58
  16. data/aux/template_query.erb +8 -36
  17. data/bin/genevalidator +45 -32
  18. data/genevalidator.gemspec +11 -7
  19. data/lib/genevalidator.rb +75 -455
  20. data/lib/genevalidator/arg_validation.rb +78 -107
  21. data/lib/genevalidator/blast.rb +57 -60
  22. data/lib/genevalidator/clusterization.rb +15 -15
  23. data/lib/genevalidator/exceptions.rb +32 -5
  24. data/lib/genevalidator/get_raw_sequences.rb +70 -33
  25. data/lib/genevalidator/hsp.rb +1 -4
  26. data/lib/genevalidator/json_to_gv_results.rb +109 -0
  27. data/lib/genevalidator/output.rb +177 -185
  28. data/lib/genevalidator/pool.rb +2 -1
  29. data/lib/genevalidator/sequences.rb +3 -3
  30. data/lib/genevalidator/tabular_parser.rb +24 -18
  31. data/lib/genevalidator/validation.rb +279 -0
  32. data/lib/genevalidator/validation_alignment.rb +31 -47
  33. data/lib/genevalidator/validation_blast_reading_frame.rb +19 -18
  34. data/lib/genevalidator/validation_duplication.rb +23 -19
  35. data/lib/genevalidator/validation_gene_merge.rb +30 -65
  36. data/lib/genevalidator/validation_length_cluster.rb +14 -53
  37. data/lib/genevalidator/validation_length_rank.rb +10 -11
  38. data/lib/genevalidator/validation_open_reading_frame.rb +18 -19
  39. data/lib/genevalidator/validation_report.rb +2 -5
  40. data/lib/genevalidator/validation_test.rb +8 -4
  41. data/lib/genevalidator/version.rb +1 -1
  42. data/test/test_all_validations.rb +51 -66
  43. data/test/test_blast.rb +68 -51
  44. data/test/test_clusterization.rb +1 -1
  45. data/test/test_clusterization_2d.rb +19 -13
  46. data/test/test_extended_array_methods.rb +1 -1
  47. data/test/test_files/all_validations_mrna/mrna.blast_tab6 +1806 -0
  48. data/test/test_files/all_validations_mrna/mrna.blast_tab7 +1865 -0
  49. data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml → mrna.blast_xml} +18642 -1
  50. data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml.index → mrna.blast_xml.index} +300 -0
  51. data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta → mrna.fa} +0 -0
  52. data/test/test_files/all_validations_mrna/mrna.raw_seq +3970 -0
  53. data/test/test_files/all_validations_mrna/{all_validations_mrna.fasta.blast_xml.raw_seq.idx → mrna.raw_seq.idx} +901 -1
  54. data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_tab → prot.blast_tab6} +416 -0
  55. data/test/test_files/all_validations_prot/prot.blast_tab7 +2400 -0
  56. data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml → prot.blast_xml} +18299 -6723
  57. data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.index → prot.blast_xml.index} +408 -0
  58. data/test/test_files/all_validations_prot/{all_validations_prot.fasta → prot.fa} +0 -0
  59. data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.raw_seq → prot.raw_seq} +2735 -0
  60. data/test/test_files/all_validations_prot/{all_validations_prot.fasta.blast_xml.raw_seq.idx → prot.raw_seq.idx} +3032 -1808
  61. data/test/test_sequences.rb +46 -41
  62. data/test/test_validation_open_reading_frame.rb +318 -202
  63. data/test/test_validations.rb +48 -32
  64. metadata +76 -102
  65. data/doc/AliasDuplicationError.html +0 -134
  66. data/doc/AlignmentValidation.html +0 -1687
  67. data/doc/AlignmentValidationOutput.html +0 -659
  68. data/doc/Blast.html +0 -1905
  69. data/doc/BlastRFValidationOutput.html +0 -545
  70. data/doc/BlastReadingFrameValidation.html +0 -370
  71. data/doc/BlastUtils.html +0 -875
  72. data/doc/ClasspathError.html +0 -134
  73. data/doc/Cluster.html +0 -1316
  74. data/doc/DuplciationValidationOutput.html +0 -564
  75. data/doc/DuplicationValidation.html +0 -920
  76. data/doc/DuplicationValidationOutput.html +0 -564
  77. data/doc/FileNotFoundException.html +0 -134
  78. data/doc/GeneMergeValidation.html +0 -935
  79. data/doc/GeneMergeValidationOutput.html +0 -652
  80. data/doc/HierarchicalClusterization.html +0 -994
  81. data/doc/Hsp.html +0 -1485
  82. data/doc/InconsistentTabularFormat.html +0 -135
  83. data/doc/LengthClusterValidation.html +0 -982
  84. data/doc/LengthClusterValidationOutput.html +0 -515
  85. data/doc/LengthRankValidation.html +0 -496
  86. data/doc/LengthRankValidationOutput.html +0 -517
  87. data/doc/NoInternetError.html +0 -135
  88. data/doc/NoMafftInstallationError.html +0 -134
  89. data/doc/NoPIdentError.html +0 -134
  90. data/doc/NoValidationError.html +0 -134
  91. data/doc/NotEnoughHitsError.html +0 -135
  92. data/doc/ORFValidationOutput.html +0 -593
  93. data/doc/OpenReadingFrameValidation.html +0 -1107
  94. data/doc/OtherError.html +0 -123
  95. data/doc/Output.html +0 -1540
  96. data/doc/Pair.html +0 -309
  97. data/doc/PairCluster.html +0 -767
  98. data/doc/Plot.html +0 -837
  99. data/doc/QueryError.html +0 -134
  100. data/doc/ReportClassError.html +0 -135
  101. data/doc/Sequence.html +0 -1299
  102. data/doc/SequenceTypeError.html +0 -135
  103. data/doc/TabularEntry.html +0 -837
  104. data/doc/TabularParser.html +0 -1104
  105. data/doc/Validation.html +0 -2147
  106. data/doc/ValidationClassError.html +0 -134
  107. data/doc/ValidationOutput.html +0 -460
  108. data/doc/ValidationReport.html +0 -940
  109. data/doc/ValidationTest.html +0 -939
  110. data/doc/_index.html +0 -449
  111. data/doc/class_list.html +0 -54
  112. data/doc/css/common.css +0 -1
  113. data/doc/css/full_list.css +0 -57
  114. data/doc/css/style.css +0 -338
  115. data/doc/file.README.html +0 -151
  116. data/doc/file_list.html +0 -56
  117. data/doc/frames.html +0 -26
  118. data/doc/index.html +0 -151
  119. data/doc/js/app.js +0 -214
  120. data/doc/js/full_list.js +0 -178
  121. data/doc/js/jquery.js +0 -4
  122. data/doc/method_list.html +0 -1505
  123. data/doc/top-level-namespace.html +0 -112
  124. data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab +0 -967
  125. data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.index +0 -967
  126. data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.raw_seq +0 -4929
  127. data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_tab.raw_seq.idx +0 -1006
  128. data/test/test_files/all_validations_mrna/all_validations_mrna.fasta.blast_xml.raw_seq +0 -2075
  129. data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.index +0 -1864
  130. data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.raw_seq +0 -42411
  131. data/test/test_files/all_validations_prot/all_validations_prot.fasta.blast_tab.raw_seq.idx +0 -3751
@@ -1,159 +1,94 @@
1
+ require 'fileutils'
2
+
3
+ require 'bio-blastxmlparser'
4
+
1
5
  require 'genevalidator/arg_validation'
2
- require 'genevalidator/get_raw_sequences'
3
- require 'genevalidator/tabular_parser'
4
6
  require 'genevalidator/blast'
5
- require 'genevalidator/output'
6
7
  require 'genevalidator/exceptions'
7
- require 'genevalidator/validation_length_cluster'
8
- require 'genevalidator/validation_length_rank'
9
- require 'genevalidator/validation_blast_reading_frame'
10
- require 'genevalidator/validation_gene_merge'
11
- require 'genevalidator/validation_duplication'
12
- require 'genevalidator/validation_open_reading_frame'
13
- require 'genevalidator/validation_alignment'
14
- require 'genevalidator/pool'
15
- require 'bio-blastxmlparser'
16
- require 'open-uri'
17
- require 'uri'
18
- require 'io/console'
19
- require 'yaml'
20
- require 'thread'
8
+ require 'genevalidator/get_raw_sequences'
9
+ require 'genevalidator/output'
10
+ require 'genevalidator/tabular_parser'
11
+ require 'genevalidator/validation'
21
12
 
22
13
  # Top level module / namespace.
23
14
  module GeneValidator
24
- Pair1 = Struct.new(:x, :y)
25
-
26
- # Main Class that initalises and then runs validations.
27
- class Validation
28
- attr_reader :opt
29
- attr_reader :type
30
- attr_reader :input_fasta_file
31
- attr_reader :html_path
32
- attr_reader :yaml_path
33
- attr_reader :filename
15
+ class << self
16
+ attr_accessor :opt, :config, :overview
34
17
  attr_reader :raw_seq_file_index
35
18
  attr_reader :raw_seq_file_load
36
- attr_accessor :idx # current number of the querry processed
37
- attr_reader :start_idx
38
19
  # array of indexes for the start offsets of each query in the fasta file
39
- attr_reader :query_offset_lst
40
- attr_reader :overall_evaluation
41
-
42
- # global variables
43
- attr_reader :no_queries
44
- attr_reader :scores
45
- attr_reader :good_predictions
46
- attr_reader :bad_predictions
47
- attr_reader :nee
48
- attr_reader :no_mafft
49
- attr_reader :no_internet
50
- attr_reader :map_errors
51
- attr_reader :map_running_times
52
-
53
- attr_reader :threads
54
- attr_reader :mutex
55
- attr_reader :mutex_yaml
56
- attr_reader :mutex_html
57
- attr_reader :mutex_array
58
-
59
- ##
60
- # Initilizes the object
61
- # Params:
62
- # +opt+: A hash - Default Values: {validations: ['all'],
63
- # blast_tabular_file: nil, blast_tabular_options: nil, blast_xml_file: nil,
64
- # db: 'remote', raw_sequences: nil, num_threads: 1 fast: false}
65
- # +start_idx+: number of the sequence from the file to start with
66
- # +overall_evaluation+: boolean variable for printing overall evaluation
67
- def initialize(opt, start_idx = 1, overall_evaluation = true)
68
- # Validate opts
69
- @opt = GVArgValidation.validate_args(opt)
70
-
71
- puts "\nDepending on your input and your computational resources, this"\
72
- ' may take a while. Please wait...'
73
-
74
- @idx = 0
75
- @start_idx = start_idx
76
-
77
- @overall_evaluation = overall_evaluation
78
-
79
- # start a worker thread
80
- @threads = [] # used for parallelizing the validations.
81
- @mutex = Mutex.new
82
- @mutex_yaml = Mutex.new
83
- @mutex_html = Mutex.new
84
- @mutex_array = Mutex.new
85
-
86
- # global variables
87
- @no_queries = 0
88
- @scores = []
89
- @good_predictions = 0
90
- @bad_predictions = 0
91
- @nee = 0
92
- @no_mafft = 0
93
- @no_internet = 0
94
- @map_errors = Hash.new(0)
95
- @map_running_times = Hash.new(Pair1.new(0, 0))
96
-
97
- @type = determine_sequence_type
98
- @query_offset_lst = index_the_input
99
-
100
- # build the path of html folder output
101
- dir = File.dirname(@opt[:input_fasta_file])
102
- @filename = File.basename(@opt[:input_fasta_file])
103
- @yaml_path = dir
104
- @html_path = "#{opt[:input_fasta_file]}.html"
105
- @plot_dir = "#{@html_path}/files/json"
106
-
107
- # create 'html' directory
108
- Dir.mkdir(@html_path)
109
- # copy auxiliar folders to the html folder
110
- aux = File.join(File.dirname(File.expand_path(__FILE__)), '../aux/files')
111
- FileUtils.cp_r(aux, @html_path)
20
+ attr_reader :query_idx
21
+ attr_accessor :mutex, :mutex_html, :mutex_json, :mutex_array
22
+
23
+ def init(opt, start_idx = 1, summary = true)
24
+ $stderr.puts 'Analysing input arguments'
25
+ @opt = opt
26
+ GVArgValidation.validate_args # validates @opt
27
+
28
+ @config = {
29
+ idx: 0,
30
+ start_idx: start_idx,
31
+ summary: summary,
32
+
33
+ type: BlastUtils.guess_sequence_type_from_input_file,
34
+ filename: File.basename(@opt[:input_fasta_file]),
35
+ html_path: "#{@opt[:input_fasta_file]}.html",
36
+ json_file: File.join(File.dirname(@opt[:input_fasta_file]),
37
+ "#{File.basename(@opt[:input_fasta_file])}.json"),
38
+ plot_dir: "#{@opt[:input_fasta_file]}.html/files/json",
39
+ aux: File.expand_path(File.join(File.dirname(__FILE__), '../aux')),
40
+
41
+ json_output: [],
42
+ run_no: 0,
43
+ output_max: 2500 # max no. of queries in the output file
44
+ }
45
+
46
+ @overview = {
47
+ no_queries: 0,
48
+ scores: [],
49
+ good_scores: 0,
50
+ bad_scores: 0,
51
+ nee: 0,
52
+ no_mafft: 0,
53
+ no_internet: 0,
54
+ map_errors: Hash.new(0),
55
+ run_time: Hash.new(Pair1.new(0, 0))
56
+ }
57
+
58
+ @mutex = Mutex.new
59
+ @mutex_array = Mutex.new
60
+ @mutex_html = Mutex.new
61
+ @mutex_json = Mutex.new
62
+ create_output_folder
63
+ index_the_input
64
+ RawSequences.index_raw_seq_file if @opt[:raw_sequences]
112
65
  end
113
66
 
114
67
  ##
115
68
  # Parse the blast output and run validations
116
69
  def run
117
- # Run BLAST on all sequences
118
- run_blast_on_the_input_file if @opt[:fast]
119
-
70
+ # Run BLAST on all sequences (generates @opt[:blast_xml_file])
71
+ # if no BLAST OUTPUT file provided...
120
72
  unless @opt[:blast_xml_file] || @opt[:blast_tabular_file]
121
- # run BLAST on each sequence individually & then run validations
122
- run_blast_on_each_sequence
123
- else
124
- # Extract raw sequences of hits
125
- extract_raw_sequences_of_blast_hits unless @opt[:raw_sequences]
126
- create_an_index_file_of_raw_seq_file(@opt[:raw_sequences])
127
- # Run Validations
128
- iterator = parse_blast_output_file
129
- run_validations(iterator)
73
+ BlastUtils.run_blast_on_input_file
130
74
  end
131
- return unless @overall_evaluation
132
- Output.print_footer(@no_queries, @scores, @good_predictions,
133
- @bad_predictions, @nee, @no_mafft, @no_internet,
134
- @map_errors, @map_running_times, @html_path,
135
- @filename)
136
- end
137
-
138
- def determine_sequence_type
139
- BlastUtils.guess_sequence_type_from_file(@opt[:input_fasta_file])
75
+ # Obtain fasta file of all BLAST hits
76
+ RawSequences.run unless @opt[:raw_sequences]
77
+ # Run Validations
78
+ iterator = parse_blast_output_file
79
+ (Validations.new).run_validations(iterator)
80
+
81
+ Output.write_json_file(@config[:json_output], @config[:json_file])
82
+ Output.print_footer(@overview, @config)
140
83
  end
141
84
 
142
85
  ##
143
- # Runs BLAST on the input file - only run when the opt[:fast] is true
144
- def run_blast_on_the_input_file
145
- return if @opt[:blast_xml_file] || @opt[:blast_tabular_file]
146
- puts 'Running BLAST'
147
- @opt[:blast_xml_file] = @opt[:input_fasta_file] + '.blast_xml'
148
- BlastUtils.run_blast_on_file(@opt)
149
- end
150
-
151
- ##
152
- # Extracts raw sequences of all blast hits
153
- def extract_raw_sequences_of_blast_hits
154
- puts 'Extracting sequences within the BLAST output file from the BLAST' \
155
- ' database'
156
- @opt[:raw_sequences] = GetRawSequences.run(@opt)
86
+ # Creates the output folder and copies the auxiliar folders to this folder
87
+ def create_output_folder(output_dir = @config[:html_path],
88
+ aux_dir = @config[:aux])
89
+ Dir.mkdir(output_dir)
90
+ aux_files = File.join(aux_dir, 'files/')
91
+ FileUtils.cp_r(aux_files, output_dir)
157
92
  end
158
93
 
159
94
  ##
@@ -162,39 +97,8 @@ module GeneValidator
162
97
  # start and end positions of each query.
163
98
  def index_the_input
164
99
  fasta_content = IO.binread(@opt[:input_fasta_file])
165
- offset_array = fasta_content.enum_for(:scan, /(>[^>]+)/).map { Regexp.last_match.begin(0) }
166
- offset_array.push(fasta_content.length)
167
- fasta_content = nil
168
- offset_array
169
- end
170
-
171
- ##
172
- # Index the raw sequences file...
173
- def create_an_index_file_of_raw_seq_file(raw_sequence_file)
174
- # leave only the identifiers in the fasta description
175
- content = File.open(raw_sequence_file, 'rb').read.gsub(/ .*/, '')
176
- File.open(raw_sequence_file, 'w+') { |f| f.write(content) }
177
-
178
- # index the fasta file
179
- keys = content.scan(/>(.*)\n/).flatten
180
- values = content.enum_for(:scan, /(>[^>]+)/).map { Regexp.last_match.begin(0) }
181
-
182
- # make an index hash
183
- index_hash = {}
184
- keys.each_with_index do |k, i|
185
- start = values[i]
186
- endf = (i == values.length - 1) ? content.length - 1 : values[i + 1]
187
- index_hash[k] = [start, endf]
188
- end
189
-
190
- # create FASTA index
191
- @raw_seq_file_index = "#{raw_sequence_file}.idx"
192
- @raw_seq_file_load = index_hash
193
-
194
- File.open(@raw_seq_file_index, 'w') do |f|
195
- YAML.dump(index_hash, f)
196
- end
197
- content = nil
100
+ @query_idx = fasta_content.enum_for(:scan, /(>[^>]+)/).map { Regexp.last_match.begin(0) }
101
+ @query_idx.push(fasta_content.length)
198
102
  end
199
103
 
200
104
  ##
@@ -206,293 +110,9 @@ module GeneValidator
206
110
  if @opt[:blast_xml_file]
207
111
  Bio::BlastXMLParser::XmlIterator.new(@opt[:blast_xml_file]).to_enum
208
112
  else
209
- TabularParser.new(@opt[:blast_tabular_file],
210
- @opt[:blast_tabular_options], @type)
113
+ TabularParser.new
211
114
  end
212
115
  ## TODO: Add a Rescue statement - e.g. if unable to create the Object...
213
116
  end
214
-
215
- ##
216
- #
217
- def run_blast_on_each_sequence
218
- # file seek for each query
219
- @query_offset_lst[0..@query_offset_lst.length - 2].each_with_index do |_pos, i|
220
- if (i + 1) >= @start_idx
221
- start_offset = @query_offset_lst[i + 1] - @query_offset_lst[i]
222
- end_offset = @query_offset_lst[i]
223
- query = IO.binread(@opt[:input_fasta_file], start_offset, end_offset)
224
-
225
- # call blast with the default parameters
226
- blast_type = (type == :protein) ? 'blastp' : 'blastx'
227
- blast_xml_output = BlastUtils.run_blast(blast_type, query, @opt[:db],
228
- @opt[:num_threads])
229
- iterator = Bio::BlastXMLParser::NokogiriBlastXml.new(blast_xml_output).to_enum
230
- run_validations(iterator)
231
- else
232
- @idx += 1
233
- end
234
- end
235
- end
236
-
237
- ##
238
- #
239
- def run_validations(iterator)
240
- p = Pool.new(@opt[:num_threads]) if @opt[:num_threads] > 1
241
-
242
- while @idx + 1 < @query_offset_lst.length
243
- prediction = get_info_on_each_query_sequence
244
- @idx += 1
245
-
246
- hits = parse_next_iteration(iterator, prediction)
247
-
248
- if hits.nil?
249
- @idx -= 1
250
- break
251
- end
252
- current_idx = @idx
253
- # the first validation should be treated separately
254
- if current_idx == @start_idx || @opt[:num_threads] == 1
255
- validate(prediction, hits, current_idx)
256
- else
257
- p.schedule(prediction, hits, current_idx) do |prediction, hits, current_idx|
258
- validate(prediction, hits, current_idx)
259
- end
260
- end
261
- end
262
- ensure
263
- p.shutdown if @opt[:num_threads] > 1
264
- end
265
-
266
- def parse_next_iteration(iterator, prediction)
267
- iterator.next if @idx < @start_idx
268
- if @opt[:blast_xml_file]
269
- BlastUtils.parse_next(iterator, @type)
270
- elsif @opt[:blast_tabular_file]
271
- iterator.parse_next(prediction.identifier)
272
- end
273
- end
274
-
275
- ##
276
- # get info about the query
277
- def get_info_on_each_query_sequence
278
- prediction = Sequence.new
279
- start_offset = @query_offset_lst[idx + 1] - @query_offset_lst[idx]
280
- end_offset = @query_offset_lst[idx]
281
- query = IO.binread(@opt[:input_fasta_file], start_offset, end_offset)
282
- parse_query = query.scan(/>([^\n]*)\n([A-Za-z\n]*)/)[0]
283
-
284
- prediction.definition = parse_query[0].gsub("\n", '')
285
- prediction.identifier = prediction.definition.gsub(/ .*/, '')
286
- prediction.type = @type
287
- prediction.raw_sequence = parse_query[1].gsub("\n", '')
288
- prediction.length_protein = prediction.raw_sequence.length
289
- prediction.length_protein /= 3 if @type == :nucleotide
290
- prediction
291
- end
292
-
293
- ##
294
- # Validate one query and create validation report
295
- # Params:
296
- # +prediction+: Sequence object
297
- # +hits+: Array of +Sequence+ objects
298
- # +idx+: the index number of the query
299
- def validate(prediction, hits, current_idx)
300
- query_output = do_validations(prediction, hits, current_idx)
301
- query_output.generate_html
302
- query_output.print_output_file_yaml
303
- query_output.print_output_console
304
-
305
- validations = query_output.validations
306
-
307
- no_mafft = 0
308
- no_internet = 0
309
- errors = []
310
- validations.each do |v|
311
- unless v.errors.nil?
312
- no_mafft += v.errors.select { |e| e == NoMafftInstallationError }.length
313
- no_internet += v.errors.select { |e| e == NoInternetError }.length
314
- end
315
- errors.push(v.short_header) if v.validation == :error
316
- end
317
-
318
- no_evidence = validations.count { |v| v.result == :unapplicable || v.result == :warning } == validations.length
319
- nee = (no_evidence) ? 1 : 0
320
-
321
- good_predictions = (query_output.overall_score >= 75) ? 1 : 0
322
- bad_predictions = (query_output.overall_score >= 75) ? 0 : 1
323
-
324
- @mutex_array.synchronize do
325
- @no_queries += 1
326
- @scores.push(query_output.overall_score)
327
- @good_predictions += good_predictions
328
- @bad_predictions += bad_predictions
329
- @nee += nee
330
- @no_mafft += no_mafft
331
- @no_internet += no_internet
332
- errors.each { |err| @map_errors[err] += 1 }
333
-
334
- validations.each do |v|
335
- next if v.running_time == 0 || v.running_time.nil?
336
- next if v.validation == :unapplicable || v.validation == :error
337
- p = Pair1.new(@map_running_times[v.short_header].x + v.running_time, @map_running_times[v.short_header].y + 1)
338
- @map_running_times[v.short_header] = p
339
- end
340
- end
341
- query_output
342
- end
343
-
344
- ##
345
- # Removes identical hits
346
- # Params:
347
- # +prediction+: Sequence object
348
- # +hits+: Array of +Sequence+ objects
349
- # Output:
350
- # new array of hit +Sequence+ objects
351
- def remove_identical_hits(prediction, hits)
352
- # remove the identical hits
353
- # identical hit means 100%coverage and >99% identity
354
- identical_hits = []
355
- hits.each do |hit|
356
- # check if all hsps have identity more than 99%
357
- low_identity = hit.hsp_list.select { |hsp| hsp.pidentity.nil? || hsp.pidentity < 99 }
358
-
359
- # check the coverage
360
- coverage = Array.new(prediction.length_protein, 0)
361
- hit.hsp_list.each do |hsp|
362
- len = hsp.match_query_to - hsp.match_query_from + 1
363
- coverage[hsp.match_query_from - 1..hsp.match_query_to - 1] = Array.new(len, 1)
364
- end
365
-
366
- if low_identity.length == 0 && coverage.uniq.length == 1
367
- identical_hits.push(hit)
368
- end
369
- end
370
-
371
- identical_hits.each { |hit| hits.delete(hit) }
372
- hits
373
- end
374
-
375
- ##
376
- # Runs all the validations and prints the outputs given the current
377
- # prediction query and the corresponding hits
378
- # Params:
379
- # +prediction+: Sequence object
380
- # +hits+: Array of +Sequence+ objects
381
- # +idx+: the index number of the query
382
- # Output:
383
- # +Output+ object
384
- def do_validations(prediction, hits, current_idx)
385
- begin
386
- hits = remove_identical_hits(prediction, hits)
387
- rescue Exception => error # NoPIdentError
388
- end
389
-
390
- query_output = Output.new(@mutex, @mutex_yaml, @mutex_html,
391
- @filename, @html_path,
392
- @yaml_path, current_idx, @start_idx)
393
- query_output.prediction_len = prediction.length_protein
394
- query_output.prediction_def = prediction.definition
395
- query_output.nr_hits = hits.length
396
-
397
- plot_path = File.join(@plot_dir, "#{@filename}_#{current_idx}")
398
-
399
- validations = []
400
- validations.push LengthClusterValidation.new(@type, prediction, hits,
401
- plot_path)
402
- validations.push LengthRankValidation.new(@type, prediction, hits)
403
- validations.push GeneMergeValidation.new(@type, prediction, hits,
404
- plot_path)
405
- validations.push DuplicationValidation.new(@type, prediction, hits,
406
- @opt[:raw_sequences],
407
- @raw_seq_file_index,
408
- @raw_seq_file_load, @opt[:db],
409
- @opt[:num_threads])
410
- validations.push BlastReadingFrameValidation.new(@type, prediction, hits)
411
- validations.push OpenReadingFrameValidation.new(@type, prediction, hits,
412
- plot_path)
413
- validations.push AlignmentValidation.new(@type, prediction, hits,
414
- plot_path, @opt[:raw_sequences],
415
- @raw_seq_file_index,
416
- @raw_seq_file_load,
417
- @opt[:db], @opt[:num_threads])
418
-
419
- validations = validations.select { |v| @opt[:validations].include? v.cli_name.downcase }
420
-
421
- # check the class type of the elements in the list
422
- validations.each do |v|
423
- fail ValidationClassError unless v.is_a? ValidationTest
424
- end
425
-
426
- # check alias duplication
427
- aliases = validations.map(&:cli_name)
428
- fail AliasDuplicationError unless aliases.length == aliases.uniq.length
429
-
430
- validations.each do |v|
431
- v.run
432
- fail ReportClassError unless v.validation_report.is_a? ValidationReport
433
- end
434
- query_output.validations = validations.map(&:validation_report)
435
-
436
- fail NoValidationError if query_output.validations.length == 0
437
-
438
- # compute validation score
439
- compute_scores(query_output)
440
- query_output
441
-
442
- rescue ValidationClassError => error
443
- error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
444
- $stderr.print "Class Type error at #{error_line}." \
445
- ' Possible cause: type of one of the validations is not' \
446
- " ValidationTest\n"
447
- exit 1
448
- rescue NoValidationError => error
449
- error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
450
- $stderr.print "Validation error at #{error_line}." \
451
- " Possible cause: your -v arguments are not valid aliases\n"
452
- exit 1
453
- rescue ReportClassError => error
454
- error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
455
- $stderr.print "Class Type error at #{error_line}."\
456
- ' Possible cause: type of one of the validation reports' \
457
- " returned by the 'run' method is not ValidationReport\n"
458
- exit 1
459
- rescue AliasDuplicationError => error
460
- error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
461
- $stderr.print "Alias Duplication error at #{error_line}."\
462
- ' Possible cause: At least two validations have the same' \
463
- " CLI alias\n"
464
- exit 1
465
- end
466
-
467
- def compute_scores(query_output)
468
- validations = query_output.validations
469
- successes = validations.map { |v| v.result == v.expected }.count(true)
470
-
471
- fails = validations.map { |v| v.validation != :unapplicable &&
472
- v.validation != :error &&
473
- v.result != v.expected }.count(true)
474
-
475
- lcv = validations.select { |v| v.class == LengthClusterValidationOutput }
476
- lrv = validations.select { |v| v.class == LengthRankValidationOutput }
477
- if lcv.length == 1 && lrv.length == 1
478
- score_lcv = (lcv[0].result == lcv[0].expected)
479
- score_lrv = (lrv[0].result == lrv[0].expected)
480
- # if both are true this should be counted as a single success
481
- if score_lcv == true && score_lrv == true
482
- successes -= 1
483
- elsif score_lcv == false && score_lrv == false
484
- # if both are false this will be a fail
485
- fails -= 1
486
- else
487
- successes -= 0.5
488
- fails -= 0.5
489
- end
490
- end
491
-
492
- query_output.successes = successes
493
- query_output.fails = fails
494
- total_query = successes.to_i + fails
495
- query_output.overall_score = (successes * 100 / (total_query)).round(0)
496
- end
497
117
  end
498
118
  end