genevalidator 1.6.12 → 2.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +30 -1
  3. data/.ruby-version +1 -0
  4. data/.travis.yml +13 -12
  5. data/Gemfile +4 -1
  6. data/Gemfile.lock +135 -0
  7. data/README.md +104 -122
  8. data/Rakefile +377 -5
  9. data/aux/gv_results.slim +155 -0
  10. data/aux/html_files/css/gv.compiled.min.css +8 -0
  11. data/aux/{files → html_files}/css/src/bootstrap.min.css +0 -0
  12. data/aux/{files → html_files}/css/src/font-awesome.min.css +0 -0
  13. data/aux/{files → html_files}/css/src/style.css +0 -0
  14. data/aux/{files → html_files}/fonts/FontAwesome.otf +0 -0
  15. data/aux/{files → html_files}/fonts/fontawesome-webfont.eot +0 -0
  16. data/aux/{files → html_files}/fonts/fontawesome-webfont.svg +0 -0
  17. data/aux/{files → html_files}/fonts/fontawesome-webfont.ttf +0 -0
  18. data/aux/{files → html_files}/fonts/fontawesome-webfont.woff +0 -0
  19. data/aux/{files → html_files}/img/gene.png +0 -0
  20. data/aux/html_files/js/gv.compiled.min.js +1 -0
  21. data/aux/{files → html_files}/js/src/bootstrap.min.js +0 -0
  22. data/aux/{files → html_files}/js/src/d3.v3.min.js +0 -0
  23. data/aux/{files → html_files}/js/src/jquery-2.1.1.min.js +0 -0
  24. data/aux/{files → html_files}/js/src/jquery.tablesorter.min.js +0 -0
  25. data/aux/{files → html_files}/js/src/plots.js +1 -1
  26. data/aux/{files → html_files}/js/src/script.js +0 -0
  27. data/aux/{files → html_files}/json/.gitkeep +0 -0
  28. data/bin/genevalidator +393 -56
  29. data/exemplar_data/README.md +60 -0
  30. data/{data/mrna_data.fasta → exemplar_data/mrna_data.fa} +1 -1
  31. data/{data/protein_data.fasta → exemplar_data/protein_data.fa} +0 -0
  32. data/genevalidator.gemspec +35 -20
  33. data/install.sh +92 -0
  34. data/lib/genevalidator.rb +171 -56
  35. data/lib/genevalidator/arg_validation.rb +26 -55
  36. data/lib/genevalidator/blast.rb +44 -99
  37. data/lib/genevalidator/clusterization.rb +18 -22
  38. data/lib/genevalidator/exceptions.rb +17 -17
  39. data/lib/genevalidator/ext/array.rb +21 -4
  40. data/lib/genevalidator/get_raw_sequences.rb +32 -31
  41. data/lib/genevalidator/hsp.rb +31 -2
  42. data/lib/genevalidator/json_to_gv_results.rb +38 -122
  43. data/lib/genevalidator/output.rb +158 -172
  44. data/lib/genevalidator/output_files.rb +134 -0
  45. data/lib/genevalidator/pool.rb +2 -5
  46. data/lib/genevalidator/query.rb +1 -1
  47. data/lib/genevalidator/tabular_parser.rb +8 -29
  48. data/lib/genevalidator/validation.rb +48 -90
  49. data/lib/genevalidator/validation_alignment.rb +64 -75
  50. data/lib/genevalidator/validation_blast_reading_frame.rb +13 -9
  51. data/lib/genevalidator/validation_duplication.rb +85 -84
  52. data/lib/genevalidator/validation_gene_merge.rb +46 -35
  53. data/lib/genevalidator/validation_length_cluster.rb +18 -15
  54. data/lib/genevalidator/validation_length_rank.rb +19 -15
  55. data/lib/genevalidator/validation_maker_qi.rb +13 -12
  56. data/lib/genevalidator/validation_open_reading_frame.rb +16 -13
  57. data/lib/genevalidator/validation_report.rb +1 -1
  58. data/lib/genevalidator/validation_test.rb +1 -1
  59. data/lib/genevalidator/version.rb +1 -1
  60. data/test/overall.rb +1 -1
  61. data/test/test_all_validations.rb +36 -24
  62. data/test/test_blast.rb +39 -24
  63. data/test/test_clusterization_2d.rb +4 -4
  64. data/test/test_helper.rb +2 -2
  65. data/test/test_query.rb +16 -20
  66. data/test/test_validation_open_reading_frame.rb +122 -122
  67. data/test/test_validations.rb +12 -10
  68. metadata +94 -79
  69. data/aux/files/css/genevalidator.compiled.min.css +0 -16
  70. data/aux/files/js/genevalidator.compiled.min.js +0 -28
  71. data/aux/json_footer.erb +0 -8
  72. data/aux/json_header.erb +0 -19
  73. data/aux/json_query.erb +0 -15
  74. data/aux/template_footer.erb +0 -8
  75. data/aux/template_header.erb +0 -19
  76. data/aux/template_query.erb +0 -14
  77. data/data/README.md +0 -57
  78. data/data/mrna_data.fasta.blast_tabular +0 -3567
  79. data/data/mrna_data.fasta.blast_tabular.raw_seq +0 -53998
  80. data/data/mrna_data.fasta.blast_tabular.raw_seq.idx +0 -5440
  81. data/data/mrna_data.fasta.blast_xml +0 -39800
  82. data/data/mrna_data.fasta.blast_xml.raw_seq +0 -2554
  83. data/data/mrna_data.fasta.blast_xml.raw_seq.idx +0 -3127
  84. data/data/mrna_data.fasta.json +0 -1
  85. data/data/protein_data.fasta.blast_tabular +0 -3278
  86. data/data/protein_data.fasta.blast_tabular.raw_seq +0 -61295
  87. data/data/protein_data.fasta.blast_tabular.raw_seq.idx +0 -4438
  88. data/data/protein_data.fasta.blast_xml +0 -26228
  89. data/data/protein_data.fasta.blast_xml.raw_seq +0 -9803
  90. data/data/protein_data.fasta.blast_xml.raw_seq.idx +0 -1777
  91. data/data/protein_data.fasta.json +0 -1
@@ -1,4 +1,3 @@
1
- # coding: utf-8
2
1
  # From http://burgestrand.se/code/ruby-thread-pool/
3
2
  #
4
3
  # Copyright © 2012, Kim Burgestrand kim@burgestrand.se
@@ -21,8 +20,6 @@
21
20
  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
21
  # SOFTWARE.
23
22
 
24
- require 'thread'
25
-
26
23
  # Class that creates a thread safe pool.
27
24
  class Pool
28
25
  def initialize(size)
@@ -58,8 +55,8 @@ if $PROGRAM_NAME == __FILE__
58
55
 
59
56
  20.times do |i|
60
57
  p.schedule do
61
- sleep rand(4) + 2
62
- $stderr.puts "Job #{i} finished by thread #{Thread.current[:id]}\n"
58
+ sleep rand(2..5)
59
+ warn "Job #{i} finished by thread #{Thread.current[:id]}\n"
63
60
  end
64
61
  end
65
62
  at_exit { p.shutdown }
@@ -22,7 +22,7 @@ module GeneValidator
22
22
  end
23
23
 
24
24
  def protein_translation
25
- (@type == :protein) ? raw_sequence : @protein_translation
25
+ @type == :protein ? raw_sequence : @protein_translation
26
26
  end
27
27
 
28
28
  ##
@@ -5,7 +5,6 @@ require 'genevalidator/exceptions'
5
5
  require 'genevalidator/hsp'
6
6
  require 'genevalidator/query'
7
7
 
8
- #
9
8
  module GeneValidator
10
9
  ##
11
10
  # This class parses the tabular output of BLAST (outfmt 6 & 7)
@@ -31,14 +30,9 @@ module GeneValidator
31
30
  ##
32
31
  #
33
32
  def analayse_tabular_file(filename)
34
- results = []
35
- file = File.read(filename)
36
- lines = CSV.parse(file, col_sep: "\t", skip_lines: /^#/,
37
- headers: @column_names)
38
- lines.each do |line|
39
- results << line.to_hash
40
- end
41
- results
33
+ lines = CSV.parse(File.read(filename), col_sep: "\t", skip_lines: /^#/,
34
+ headers: @column_names)
35
+ lines.map(&:to_hash)
42
36
  end
43
37
 
44
38
  ##
@@ -63,7 +57,7 @@ module GeneValidator
63
57
  move_to_next_query
64
58
  hit_seq
65
59
  rescue StopIteration
66
- return []
60
+ []
67
61
  end
68
62
 
69
63
  private
@@ -72,29 +66,14 @@ module GeneValidator
72
66
  #
73
67
  def initialise_classes(current_id, tab_results = @tab_results)
74
68
  hits = tab_results.partition { |h| h['qseqid'] == current_id }[0]
75
- hit_list = []
76
69
  grouped_hits = hits.group_by { |row| row['sseqid'] }
77
70
 
78
- grouped_hits.each do |query_id, row|
71
+ grouped_hits.map do |_query_id, rows|
79
72
  hit_seq = Query.new
80
- hit_seq.init_tabular_attribute(row[0])
81
-
82
- initialise_all_hsps(query_id, hits, hit_seq)
83
-
73
+ hit_seq.init_tabular_attribute(rows[0])
74
+ hit_seq.hsp_list = rows.map { |row| Hsp.new(tabular_input: row) }
84
75
  hit_seq.type = :protein
85
- hit_list.push(hit_seq)
86
- end
87
- hit_list
88
- end
89
-
90
- ##
91
- #
92
- def initialise_all_hsps(current_query_id, hits, hit_seq)
93
- hsps = hits.select { |row| row['sseqid'] == current_query_id }
94
- hsps.each do |row|
95
- hsp = Hsp.new
96
- hsp.init_tabular_attribute(row)
97
- hit_seq.hsp_list.push(hsp)
76
+ hit_seq
98
77
  end
99
78
  end
100
79
  end
@@ -49,11 +49,14 @@ module GeneValidator
49
49
  break
50
50
  end
51
51
 
52
+ arr_idx = @config[:idx] - 1
53
+ next unless @config[:json_output][arr_idx].nil?
54
+
52
55
  if @opt[:num_threads] == 1
53
- (Validate.new).validate(prediction, blast_hits, @config[:idx])
56
+ Validate.new.validate(prediction, blast_hits, @config[:idx])
54
57
  else
55
58
  p.schedule(prediction, blast_hits, @config[:idx]) do |pred, hits, idx|
56
- (Validate.new).validate(pred, hits, idx)
59
+ Validate.new.validate(pred, hits, idx)
57
60
  end
58
61
  end
59
62
  end
@@ -63,18 +66,16 @@ module GeneValidator
63
66
 
64
67
  ##
65
68
  # get info about the query
66
- def get_info_on_query_sequence(input_file = @opt[:input_fasta_file],
67
- seq_type = @config[:type])
68
- start_offset = @query_idx[@config[:idx] + 1] - @query_idx[@config[:idx]]
69
- end_offset = @query_idx[@config[:idx]]
70
- query = IO.binread(input_file, start_offset, end_offset)
69
+ def get_info_on_query_sequence(seq_type = @config[:type],
70
+ index = @config[:idx])
71
+ query = GeneValidator.extract_input_fasta_sequence(index)
71
72
  parse_query = query.scan(/>([^\n]*)\n([A-Za-z\n]*)/)[0]
72
73
 
73
74
  prediction = Query.new
74
- prediction.definition = parse_query[0].gsub("\n", '')
75
+ prediction.definition = parse_query[0].delete("\n")
75
76
  prediction.identifier = prediction.definition.gsub(/ .*/, '')
76
77
  prediction.type = seq_type
77
- prediction.raw_sequence = parse_query[1].gsub("\n", '')
78
+ prediction.raw_sequence = parse_query[1].delete("\n")
78
79
  prediction.length_protein = prediction.raw_sequence.length
79
80
  prediction.length_protein /= 3 if seq_type == :nucleotide
80
81
  prediction
@@ -82,10 +83,10 @@ module GeneValidator
82
83
 
83
84
  # Adds 'maker' to @opt[:validations] if the first definiton in the input
84
85
  # fasta file contains MAKER's QI (quality index) score
85
- def check_if_maker_input?(input_file = @opt[:input_fasta_file])
86
- query = IO.binread(input_file, @query_idx[1], @query_idx[0])
86
+ def check_if_maker_input?
87
+ query = GeneValidator.extract_input_fasta_sequence(0)
87
88
  parse_query = query.scan(/>([^\n]*)\n([A-Za-z\n]*)/)[0]
88
- definition = parse_query[0].gsub("\n", '')
89
+ definition = parse_query[0].delete("\n")
89
90
  number = '-?\d*\.?\d*'
90
91
  qi_match = definition.match(/QI:#{number}\|#{number}\|#{number}\|
91
92
  #{number}\|#{number}\|#{number}\|
@@ -107,8 +108,7 @@ module GeneValidator
107
108
  # Class that runs the validations (Instatiated for each query)
108
109
  class Validate
109
110
  extend Forwardable
110
- def_delegators GeneValidator, :opt, :config, :mutex_array, :overview,
111
- :query_idx
111
+ def_delegators GeneValidator, :opt, :config, :overview, :query_idx
112
112
 
113
113
  ##
114
114
  # Initilizes the object
@@ -121,7 +121,6 @@ module GeneValidator
121
121
  def initialize
122
122
  @opt = opt
123
123
  @config = config
124
- @mutex_array = mutex_array
125
124
  @run_output = nil
126
125
  @overview = overview
127
126
  @query_idx = query_idx
@@ -142,7 +141,7 @@ module GeneValidator
142
141
  @run_output.validations = vals.map(&:validation_report)
143
142
  check_validations_output(vals)
144
143
 
145
- compute_scores
144
+ compute_run_score
146
145
  generate_run_output
147
146
  end
148
147
 
@@ -154,27 +153,24 @@ module GeneValidator
154
153
  # Output:
155
154
  # new array of hit +Sequence+ objects
156
155
  def remove_identical_hits(prediction, hits)
157
- identical_hits = []
158
- hits.each do |hit|
156
+ hits.delete_if do |hit|
159
157
  low_identity = hit.hsp_list.select { |hsp| hsp.pidentity < 99 }
160
158
  no_data = hit.hsp_list.select { |hsp| hsp.pidentity.nil? }
161
159
  low_identity += no_data
162
- # check the coverage
163
- coverage = Array.new(prediction.length_protein, 0)
164
- hit.hsp_list.each do |hsp|
165
- match_to = hsp.match_query_to
166
- match_from = hsp.match_query_from
167
- len = match_to - match_from + 1
168
- coverage[match_from - 1..match_to - 1] = Array.new(len, 1)
169
- end
170
-
171
- if low_identity.length == 0 && coverage.uniq.length == 1
172
- identical_hits.push(hit)
173
- end
160
+ coverage = check_hit_coverage(prediction, hit)
161
+ low_identity.empty? && coverage.uniq.length == 1
174
162
  end
163
+ end
175
164
 
176
- identical_hits.each { |hit| hits.delete(hit) }
177
- hits
165
+ def check_hit_coverage(prediction, hit)
166
+ coverage = Array.new(prediction.length_protein, 0)
167
+ hit.hsp_list.each do |hsp|
168
+ match_to = hsp.match_query_to
169
+ match_from = hsp.match_query_from
170
+ len = match_to - match_from + 1
171
+ coverage[match_from - 1..match_to - 1] = Array.new(len, 1)
172
+ end
173
+ coverage
178
174
  end
179
175
 
180
176
  def create_validation_tests(prediction, hits)
@@ -197,45 +193,48 @@ module GeneValidator
197
193
 
198
194
  def check_validations(vals)
199
195
  # check the class type of the elements in the list
200
- vals.each { |v| fail ValidationClassError unless v.is_a? ValidationTest }
196
+ vals.each { |v| raise ValidationClassError unless v.is_a? ValidationTest }
201
197
  # check alias duplication
202
198
  aliases = vals.map(&:cli_name)
203
- fail AliasDuplicationError unless aliases.length == aliases.uniq.length
199
+ raise AliasDuplicationError unless aliases.length == aliases.uniq.length
204
200
  rescue ValidationClassError => e
205
- $stderr.puts e
201
+ warn e
206
202
  exit 1
207
203
  rescue AliasDuplicationError => e
208
- $stderr.puts e
204
+ warn e
209
205
  exit 1
210
206
  end
211
207
 
212
208
  def check_validations_output(vals)
213
- fail NoValidationError if @run_output.validations.length == 0
209
+ raise NoValidationError if @run_output.validations.empty?
214
210
  vals.each do |v|
215
- fail ReportClassError unless v.validation_report.is_a? ValidationReport
211
+ raise ReportClassError unless v.validation_report.is_a? ValidationReport
216
212
  end
217
213
  rescue NoValidationError => e
218
- $stderr.puts e
214
+ warn e
219
215
  exit 1
220
216
  rescue ReportClassError => e
221
- $stderr.puts e
217
+ warn e
222
218
  exit 1
223
219
  end
224
220
 
225
- def compute_scores
221
+ def compute_run_score
226
222
  validations = @run_output.validations
227
223
  scores = {}
228
224
  scores[:successes] = validations.count { |v| v.result == v.expected }
229
- scores[:fails] = validations.count { |v| v.validation != :unapplicable && v.validation != :error && v.result != v.expected }
230
- scores = length_validation_scores(validations, scores)
225
+ scores[:fails] = validations.count do |v|
226
+ v.validation != :unapplicable && v.validation != :error &&
227
+ v.result != v.expected
228
+ end
229
+ scores = length_validation_scores(validations, scores)
231
230
 
232
- @run_output.successes = scores[:successes]
233
- @run_output.fails = scores[:fails]
234
- total_query = scores[:successes].to_i + scores[:fails]
235
- if total_query == 0
231
+ @run_output.successes = scores[:successes]
232
+ @run_output.fails = scores[:fails]
233
+ num_total_validations = scores[:successes].to_i + scores[:fails]
234
+ if num_total_validations.zero?
236
235
  @run_output.overall_score = 0
237
236
  else
238
- @run_output.overall_score = (scores[:successes] * 90 / total_query).round
237
+ @run_output.overall_score = (scores[:successes] * 90 / num_total_validations).round
239
238
  end
240
239
  end
241
240
 
@@ -260,49 +259,8 @@ module GeneValidator
260
259
  end
261
260
 
262
261
  def generate_run_output
263
- @run_output.generate_html
264
- @run_output.generate_json
265
262
  @run_output.print_output_console
266
- generate_run_overview
267
- end
268
-
269
- def generate_run_overview
270
- vals = @run_output.validations
271
- no_mafft = 0
272
- no_internet = 0
273
- errors = []
274
- vals.each do |v|
275
- unless v.errors.nil?
276
- no_mafft += v.errors.count { |e| e == NoMafftInstallationError }
277
- no_internet += v.errors.count { |e| e == NoInternetError }
278
- end
279
- errors.push(v.short_header) if v.validation == :error
280
- end
281
-
282
- no_evidence = vals.count { |v| v.result == :unapplicable || v.result == :warning } == vals.length
283
- nee = (no_evidence) ? 1 : 0
284
-
285
- good_scores = (@run_output.overall_score >= 75) ? 1 : 0
286
- bad_scores = (@run_output.overall_score >= 75) ? 0 : 1
287
-
288
- @mutex_array.synchronize do
289
- @overview[:no_queries] += 1
290
- @overview[:scores].push(@run_output.overall_score)
291
- @overview[:good_scores] += good_scores
292
- @overview[:bad_scores] += bad_scores
293
- @overview[:nee] += nee
294
- @overview[:no_mafft] += no_mafft
295
- @overview[:no_internet] += no_internet
296
- errors.each { |err| @overview[:map_errors][err] += 1 }
297
-
298
- vals.each do |v|
299
- next if v.run_time == 0 || v.run_time.nil?
300
- next if v.validation == :unapplicable || v.validation == :error
301
- p = Pair1.new(@overview[:run_time][v.short_header].x + v.run_time,
302
- @overview[:run_time][v.short_header].y + 1)
303
- @overview[:run_time][v.short_header] = p
304
- end
305
- end
263
+ @run_output.generate_json
306
264
  end
307
265
  end
308
266
  end
@@ -17,9 +17,11 @@ module GeneValidator
17
17
  attr_reader :result
18
18
 
19
19
  def initialize(short_header, header, description, gaps = 0, extra_seq = 0,
20
- consensus = 1, threshold = 20, expected = :yes)
20
+ consensus = 1, threshold = 20, expected = :yes)
21
21
 
22
- @short_header, @header, @description = short_header, header, description
22
+ @short_header = short_header
23
+ @header = header
24
+ @description = description
23
25
  @gaps = (gaps * 100).round.to_s + '%'
24
26
  @extra_seq = (extra_seq * 100).round.to_s + '%'
25
27
  @consensus = (consensus * 100).round.to_s + '%'
@@ -47,7 +49,9 @@ module GeneValidator
47
49
  else
48
50
  t = 'These results suggest that there may be some problems with' \
49
51
  ' the query sequence.'
50
- t1, t2, t3 = '', '', '' # Create empty string variables
52
+ t1 = ''
53
+ t2 = ''
54
+ t3 = '' # Create empty string variables
51
55
  if (1 - consensus.to_i) > @threshold
52
56
  t1 = ' There is low conservation of residues between the' \
53
57
  ' statistical profile and the query sequence (the cut-off' \
@@ -114,7 +118,7 @@ module GeneValidator
114
118
  @raw_seq_file_load = config[:raw_seq_file_load]
115
119
  @db = opt[:db]
116
120
  @multiple_alignment = []
117
- @num_threads = opt[:num_threads]
121
+ @num_threads = opt[:mafft_threads]
118
122
  @type = config[:type]
119
123
  end
120
124
 
@@ -123,32 +127,31 @@ module GeneValidator
123
127
  # of the first n hits
124
128
  # Output:
125
129
  # +AlignmentValidationOutput+ object
126
- def run(n = 10)
130
+ def run
131
+ n = opt[:min_blast_hits] < 10 ? 10 : opt[:min_blast_hits]
127
132
  n = 50 if n > 50
128
133
 
129
- fail NotEnoughHitsError unless hits.length >= n
130
- fail unless prediction.is_a?(Query) && hits[0].is_a?(Query)
134
+ raise NotEnoughHitsError if hits.length < n
135
+ raise unless prediction.is_a?(Query) && hits[0].is_a?(Query)
131
136
 
132
137
  start = Time.new
133
138
  # get the first n hits
134
- less_hits = @hits[0..[n - 1, @hits.length].min]
135
- useless_hits = []
136
-
139
+ n_hits = [n - 1, @hits.length].min
140
+ less_hits = @hits[0..n_hits]
137
141
  # get raw sequences for less_hits
138
- less_hits.map do |hit|
139
- next unless hit.raw_sequence.nil?
140
- hit.raw_sequence = FetchRawSequences.run(hit.identifier,
141
- hit.accession_no)
142
- useless_hits.push(hit) if hit.raw_sequence.nil?
142
+ less_hits.delete_if do |hit|
143
+ if hit.raw_sequence.nil?
144
+ hit.raw_sequence = FetchRawSequences.run(hit.identifier,
145
+ hit.accession_no)
146
+ end
147
+ hit.raw_sequence.nil? ? true : false
143
148
  end
144
149
 
145
- useless_hits.each { |hit| less_hits.delete(hit) }
146
-
147
- fail NoInternetError if less_hits.length == 0
150
+ raise NoInternetError if less_hits.length.zero?
148
151
  # in case of nucleotide prediction sequence translate into protein
149
152
  # translate with the reading frame of all hits considered for alignment
150
153
  reading_frames = less_hits.map(&:reading_frame).uniq
151
- fail ReadingFrameError if reading_frames.length != 1
154
+ raise ReadingFrameError if reading_frames.length != 1
152
155
 
153
156
  if @type == :nucleotide
154
157
  s = Bio::Sequence::NA.new(prediction.raw_sequence)
@@ -157,7 +160,7 @@ module GeneValidator
157
160
 
158
161
  # multiple align sequences from less_hits with the prediction
159
162
  # the prediction is the last sequence in the vector
160
- multiple_align_mafft(prediction, less_hits)
163
+ @multiple_alignment = multiple_align_mafft(prediction, less_hits)
161
164
 
162
165
  out = get_sm_pssm(@multiple_alignment[0..@multiple_alignment.length - 2])
163
166
  sm = out[0]
@@ -182,7 +185,6 @@ module GeneValidator
182
185
  @validation_report.plot_files.push(plot1)
183
186
  @validation_report.run_time = Time.now - start
184
187
  @validation_report
185
-
186
188
  rescue NotEnoughHitsError
187
189
  @validation_report = ValidationReport.new('Not enough evidence',
188
190
  :warning, @short_header,
@@ -202,7 +204,7 @@ module GeneValidator
202
204
  :error, @short_header,
203
205
  @header, @description)
204
206
  @validation_report.errors.push 'Multiple reading frames Error'
205
- rescue
207
+ rescue StandardError
206
208
  @validation_report = ValidationReport.new('Unexpected error', :error,
207
209
  @short_header, @header,
208
210
  @description)
@@ -221,25 +223,25 @@ module GeneValidator
221
223
  # Output:
222
224
  # Array of +String+s, corresponding to the multiple aligned sequences
223
225
  # the prediction is the last sequence in the vector
224
- def multiple_align_mafft(prediction = @prediction, hits = @hits)
225
- fail unless prediction.is_a?(Query) && hits[0].is_a?(Query)
226
-
227
- options = ['--maxiterate', '1000', '--localpair', '--anysymbol',
228
- '--quiet', '--thread', "#{@num_threads}"]
229
- mafft = Bio::MAFFT.new('mafft', options)
230
- sequences = hits.map(&:raw_sequence)
226
+ def multiple_align_mafft(prediction, hits)
227
+ raise unless prediction.is_a?(Query) && hits[0].is_a?(Query)
228
+
229
+ opt = ['--maxiterate', '1000', '--localpair', '--anysymbol', '--quiet',
230
+ '--thread', @num_threads.to_s]
231
+ mafft = Bio::MAFFT.new('mafft', opt)
232
+ sequences = hits.map do |h|
233
+ # remove the seq id - as MAFFT sometimes has an issue with this
234
+ f = Bio::FastaFormat.new(h.raw_sequence)
235
+ # check if fasta sequence otherwise returne original entry
236
+ f.seq.empty? ? h.raw_sequence : f.seq
237
+ end
231
238
  sequences.push(prediction.protein_translation)
232
239
 
233
240
  report = mafft.query_align(sequences)
234
- # Accesses the actual alignment.
235
- align = report.alignment
236
-
237
- align.each_with_index do |s, _i|
238
- @multiple_alignment.push(s.to_s)
239
- end
240
-
241
- @multiple_alignment
242
- rescue
241
+ alignment = report.alignment.map(&:to_s)
242
+ raise NoMafftInstallationError if alignment.empty?
243
+ alignment
244
+ rescue StandardError
243
245
  raise NoMafftInstallationError
244
246
  end
245
247
 
@@ -252,7 +254,7 @@ module GeneValidator
252
254
  # +ma+: array of +String+s, corresponding to the multiple aligned sequences
253
255
  # Output:
254
256
  # +String+ with the consensus regions
255
- def get_consensus(ma = @multiple_alignment)
257
+ def get_consensus(ma)
256
258
  align = Bio::Alignment.new(ma)
257
259
  align.consensus
258
260
  end
@@ -308,7 +310,7 @@ module GeneValidator
308
310
  # no of conserved residues among the hits
309
311
  no_conserved_residues = consensus.length - consensus.scan(/[\?-]/).length
310
312
 
311
- return 1 if no_conserved_residues == 0
313
+ return 1 if no_conserved_residues.zero?
312
314
 
313
315
  # no of conserved residues from the hita that appear in the prediction
314
316
  no_conserved_pred = consensus.split(//).each_index.count { |j| consensus[j] != '-' && consensus[j] != '?' && consensus[j] == prediction_raw[j] }
@@ -327,7 +329,7 @@ module GeneValidator
327
329
  # +String+ representing the statistical model
328
330
  # +Array+ with the maximum frequeny of the majoritary residue for each
329
331
  # position
330
- def get_sm_pssm(ma = @multiple_alignment, threshold = 0.7)
332
+ def get_sm_pssm(ma, threshold = 0.7)
331
333
  sm = ''
332
334
  freq = []
333
335
  (0..ma[0].length - 1).each do |i|
@@ -335,19 +337,10 @@ module GeneValidator
335
337
  ma.map { |seq| seq[i] }.each { |res| freqs[res] += 1 }
336
338
  # get the residue with the highest frequency
337
339
  max_freq = freqs.map { |_res, n| n }.max
338
- residue = (freqs.map { |res, n| n == max_freq ? res : [] }.flatten)[0]
339
-
340
- if residue == '-'
341
- freq.push(0)
342
- else
343
- freq.push(max_freq / (ma.length + 0.0))
344
- end
340
+ residue = freqs.map { |res, n| n == max_freq ? res : [] }.flatten[0]
345
341
 
346
- if max_freq / (ma.length + 0.0) >= threshold
347
- sm << residue
348
- else
349
- sm << '?'
350
- end
342
+ freq << residue == '-' ? 0 : (max_freq / ma.length.to_f)
343
+ sm += (max_freq / ma.length.to_f) >= threshold ? residue : '?'
351
344
  end
352
345
  [sm, freq]
353
346
  end
@@ -364,16 +357,12 @@ module GeneValidator
364
357
  gap_starts = seq.to_enum(:scan, /(-\w{1,#{len}}-)/i).map { |_m| $`.size + 1 }
365
358
  # remove isolated residues
366
359
  gap_starts.each do |i|
367
- (i..i + len - 1).each do |j|
368
- seq[j] = '-' if isalpha(seq[j])
369
- end
360
+ (i..i + len - 1).each { |j| seq[j] = '-' if isalpha(seq[j]) }
370
361
  end
371
362
  # remove isolated gaps
372
363
  res_starts = seq.to_enum(:scan, /([?\w]-{1,2}[?\w])/i).map { |_m| $`.size + 1 }
373
364
  res_starts.each do |i|
374
- (i..i + len - 1).each do |j|
375
- seq[j] = '?' if seq[j] == '-'
376
- end
365
+ (i..i + len - 1).each { |j| seq[j] = '?' if seq[j] == '-' }
377
366
  end
378
367
  seq
379
368
  end
@@ -390,10 +379,11 @@ module GeneValidator
390
379
  def array_to_ranges(ar)
391
380
  prev = ar[0]
392
381
 
393
- ranges = ar.slice_before { |e|
394
- prev, prev2 = e, prev
382
+ ranges = ar.slice_before do |e|
383
+ prev2 = prev
384
+ prev = e
395
385
  prev2 + 1 != e
396
- }.map { |a| a[0]..a[-1] }
386
+ end.map { |a| a[0]..a[-1] }
397
387
 
398
388
  ranges
399
389
  end
@@ -406,17 +396,16 @@ module GeneValidator
406
396
  # +ma+: +String+ array with the multiple alignmened hits and prediction
407
397
  def plot_alignment(freq, ma = @multiple_alignment)
408
398
  # get indeces of consensus in the multiple alignment
409
- consensus = get_consensus(@multiple_alignment[0..@multiple_alignment.length - 2])
399
+ consensus = get_consensus(ma[0..ma.length - 2])
410
400
  consensus_idxs = consensus.split(//).each_index.select { |j| isalpha(consensus[j]) }
411
401
  consensus_ranges = array_to_ranges(consensus_idxs)
412
402
 
413
- consensus_all = get_consensus(@multiple_alignment)
403
+ consensus_all = get_consensus(ma)
414
404
  consensus_all_idxs = consensus_all.split(//).each_index.select { |j| isalpha(consensus_all[j]) }
415
405
  consensus_all_ranges = array_to_ranges(consensus_all_idxs)
416
406
 
417
407
  match_alignment = ma[0..ma.length - 2].each_with_index.map { |seq, _j| seq.split(//).each_index.select { |j| isalpha(seq[j]) } }
418
- match_alignment_ranges = []
419
- match_alignment.each { |arr| match_alignment_ranges << array_to_ranges(arr) }
408
+ match_alignment_ranges = match_alignment.map { |e| array_to_ranges(e) }
420
409
 
421
410
  query_alignment = ma[ma.length - 1].split(//).each_index.select { |j| isalpha(ma[ma.length - 1][j]) }
422
411
  query_alignment_ranges = array_to_ranges(query_alignment)
@@ -425,15 +414,15 @@ module GeneValidator
425
414
 
426
415
  # plot statistical model
427
416
  data = freq.each_with_index.map { |h, j| { 'y' => ma.length, 'start' => j, 'stop' => j + 1, 'color' => 'orange', 'height' => h } } +
428
- # hits
429
- match_alignment_ranges.each_with_index.map { |ranges, j| ranges.map { |range| { 'y' => ma.length - j - 1, 'start' => range.first, 'stop' => range.last, 'color' => 'red', 'height' => -1 } } }.flatten +
430
- ma[0..ma.length - 2].each_with_index.map { |_seq, j| consensus_ranges.map { |range| { 'y' => j + 1, 'start' => range.first, 'stop' => range.last, 'color' => 'yellow', 'height' => -1 } } }.flatten +
431
- # plot prediction
432
- [{ 'y' => 0, 'start' => 0, 'stop' => len, 'color' => 'gray', 'height' => -1 }] +
433
- query_alignment_ranges.map { |range| { 'y' => 0, 'start' => range.first, 'stop' => range.last, 'color' => 'red', 'height' => -1 } }.flatten +
434
-
435
- # plot consensus
436
- consensus_all_ranges.map { |range| { 'y' => 0, 'start' => range.first, 'stop' => range.last, 'color' => 'yellow', 'height' => -1 } }.flatten
417
+ # hits
418
+ match_alignment_ranges.each_with_index.map { |ranges, j| ranges.map { |range| { 'y' => ma.length - j - 1, 'start' => range.first, 'stop' => range.last, 'color' => 'red', 'height' => -1 } } }.flatten +
419
+ ma[0..ma.length - 2].each_with_index.map { |_seq, j| consensus_ranges.map { |range| { 'y' => j + 1, 'start' => range.first, 'stop' => range.last, 'color' => 'yellow', 'height' => -1 } } }.flatten +
420
+ # plot prediction
421
+ [{ 'y' => 0, 'start' => 0, 'stop' => len, 'color' => 'gray', 'height' => -1 }] +
422
+ query_alignment_ranges.map { |range| { 'y' => 0, 'start' => range.first, 'stop' => range.last, 'color' => 'red', 'height' => -1 } }.flatten +
423
+
424
+ # plot consensus
425
+ consensus_all_ranges.map { |range| { 'y' => 0, 'start' => range.first, 'stop' => range.last, 'color' => 'yellow', 'height' => -1 } }.flatten
437
426
 
438
427
  y_axis_values = 'Prediction'
439
428
  (1..ma.length - 1).each { |i| y_axis_values << ", hit #{i}" }